From fec298b472b1a3abd3693e189d9859087f5fd0c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20C=C3=A9r=C3=A8s?= Date: Tue, 26 Nov 2019 11:19:21 -0500 Subject: [PATCH] =?UTF-8?q?Int=C3=A9gration=20du=20module=20fetch?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- data/{wikimedica => }/alignement.py | 70 ++++++------------ .../fetch/__pycache__/__init__.cpython-38.pyc | Bin 161 -> 159 bytes data/fetch/__pycache__/http.cpython-38.pyc | Bin 537 -> 535 bytes data/fetch/wikidata.py | 20 +++++ data/fetch/wikimedica.py | 18 +++++ 5 files changed, 59 insertions(+), 49 deletions(-) rename data/{wikimedica => }/alignement.py (60%) create mode 100644 data/fetch/wikidata.py create mode 100644 data/fetch/wikimedica.py diff --git a/data/wikimedica/alignement.py b/data/alignement.py similarity index 60% rename from data/wikimedica/alignement.py rename to data/alignement.py index 231acca..77e9e98 100644 --- a/data/wikimedica/alignement.py +++ b/data/alignement.py @@ -1,3 +1,4 @@ +from fetch import wikidata, wikimedica import requests import rdflib @@ -8,48 +9,16 @@ SIGNES_ET_SYMPTOMES_WIKIMEDICA_PAGE = ( PREFIX_PROPERTY = "http://wikimedi.ca/wiki/Special:URIResolver/Property-3A" -WIKIDATA_SPARQL_ENDPOINT = "https://query.wikidata.org/sparql" - USER_AGENT = "WikimedicaDiseaseSearch/0.1 (https://gitlab.com/matteodelabre/wikimedica-disease-search)" session = requests.Session() session.headers = {'User-Agent': USER_AGENT} -def request_wikimedica(page, request): - """ - Effectue une requête SPARQL sur une page de WikiMedica - - :param page: Page de WikiMedica ciblé - :param request: Requête SPARQL appliqué - """ - data = session.get(page, stream=True) - - g = rdflib.Graph() - g.parse(data.raw) - qres = g.query(request) - - return [row.asdict() for row in qres] - - -def request_endpoint(endpoint, request): - """ - Effectue une requête SPARQL sur point d'accés - - :param page: Adresse du point d'accés - :param request: Requête SPARQL appliqué - """ - res = session.get(endpoint, params={'format': 'json' ,'query': request}) - - if res.status_code != 200: - raise Exception('Erreur {}'.format(res.status_code)) - - return res.json() - nb_wikimedica_entity = 0 nb_align_by_property = 0 nb_align_by_name = 0 -nb_align_conflict = 0 +nb_multi_align_possibility = 0 # get name and URI from WikiMedica request_name_uri = """ @@ -60,7 +29,7 @@ request_name_uri = """ } """ -entitys_dic = request_wikimedica( +entitys_dic = wikimedica.request( SIGNES_ET_SYMPTOMES_WIKIMEDICA_PAGE, request_name_uri ) @@ -69,10 +38,13 @@ nb_wikimedica_entity = len(entitys_dic) # Try align entity with Wikidata for entity in entitys_dic: - print() - print("="*10 + " " + entity['name'] + " " + "="*10) - print(entity['uri']) + # Pre-traitment on name + entity['name'] = (entity['name'].split("(")[0]).strip() + name = entity['name'] + print() + print("="*10 + " " + name + " " + "="*10) + print(entity['uri']) # use Wikidata_id property request_prop_Wikidata_id = """ @@ -83,7 +55,7 @@ for entity in entitys_dic: }} """.format(PREFIX_PROPERTY) - prop_Wikidata_id = request_wikimedica( + prop_Wikidata_id = wikimedica.request( entity['uri'], request_prop_Wikidata_id ) @@ -95,7 +67,6 @@ for entity in entitys_dic: elif len(prop_Wikidata_id) > 1: print("Attention : il y a plus de 1 lien 'property:wikidata_id'") - # use name of entity else: request_search_by_name = """ @@ -106,12 +77,11 @@ for entity in entitys_dic: VALUES ?entityLabel {{"{}"@fr "{}"@fr}} VALUES ?type {{wd:Q12136 wd:Q1441305 wd:Q169872}} }} - """.format((entity['name']).lower(), (entity['name']).capitalize()) + """.format(name.lower(), name.capitalize()) - list_wikidata_pages_uri = (request_endpoint( - WIKIDATA_SPARQL_ENDPOINT, - request_search_by_name - ))['results']['bindings'] + list_wikidata_pages_uri = ( + wikidata.request(request_search_by_name) + )['results']['bindings'] if len(list_wikidata_pages_uri) == 1: nb_align_by_name = nb_align_by_name + 1 @@ -120,12 +90,14 @@ for entity in entitys_dic: print("align with 'name', 1 response") print(wikidata_uri['entity']['value']) elif len(list_wikidata_pages_uri) > 1: - nb_align_conflict = nb_align_conflict + 1 + nb_multi_align_possibility = nb_multi_align_possibility + 1 print("align with 'name', multi response") print(list_wikidata_pages_uri) print() -print("nb_wikimedica_entity : ", nb_wikimedica_entity) -print("nb_align_by_property : ", nb_align_by_property) -print("nb_align_by_name : ", nb_align_by_name) -print("nb_align_conflict : ", nb_align_conflict) \ No newline at end of file +print("nb wikimedica_entity : ", nb_wikimedica_entity) +print("nb align by property : ", nb_align_by_property) +print("nb align by name : ", nb_align_by_name) +print("nb multi align possibility : ", nb_multi_align_possibility) +print("nb no align : ", nb_wikimedica_entity - + nb_align_by_property - nb_align_by_name) diff --git a/data/fetch/__pycache__/__init__.cpython-38.pyc b/data/fetch/__pycache__/__init__.cpython-38.pyc index cd19e33204b9362ba7959dd4d46d2437298d2963..60cbb67ccd19676ecfaf57047aa0d84898a47583 100644 GIT binary patch delta 33 ncmZ3;IG>R#l$V!_0SH15OysiVcG1ts&rQ`YO3lrj7~%^6f1e2G delta 35 pcmbQwxR8-6l$V!_0SE#%P2{rWb<@wt&rQ|OO)M!%&7T#JlarZbd~0$bV}vLtP)!kt V_~oFVk)NBYUzD1gIr#u%BmkxX6q^75 delta 72 zcmbQvGLwZXl$V!_0SK1u+Q?#JlarZb95p$RF+!XjsHO-+ X{BqLI$j?pH&rK{TNzI?Uk1-Mew^9_r diff --git a/data/fetch/wikidata.py b/data/fetch/wikidata.py new file mode 100644 index 0000000..d140ba0 --- /dev/null +++ b/data/fetch/wikidata.py @@ -0,0 +1,20 @@ +from .http import session + +WIKIDATA_SPARQL_ENDPOINT = "https://query.wikidata.org/sparql" + +def request(request): + """ + Effectue une requête SPARQL sur le point d'accés de Wikidata + + :param request: Requête SPARQL appliqué + :return: Réponse en json du point d'accés + """ + res = session.get( + WIKIDATA_SPARQL_ENDPOINT, + params={'format': 'json' ,'query': request} + ) + + if res.status_code != 200: + raise Exception('Erreur {}'.format(res.status_code)) + + return res.json() \ No newline at end of file diff --git a/data/fetch/wikimedica.py b/data/fetch/wikimedica.py new file mode 100644 index 0000000..65d991f --- /dev/null +++ b/data/fetch/wikimedica.py @@ -0,0 +1,18 @@ +from .http import session +import rdflib + +def request(page, request): + """ + Effectue une requête SPARQL sur une page de WikiMedica + + :param page: Page de WikiMedica ciblé + :param request: Requête SPARQL appliqué + :return: Réponse du point d'accés sous forme d'un tableau de dictionaire + """ + data = session.get(page, stream=True) + + g = rdflib.Graph() + g.parse(data.raw) + qres = g.query(request) + + return [row.asdict() for row in qres]