From 965d8536c646266af9fe4197d53c7730b443f3fe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20C=C3=A9r=C3=A8s?=
Date: Tue, 26 Nov 2019 01:05:31 -0500
Subject: [PATCH 1/2] alignement: Ajout de l'algorithme d'alignement

---
Review notes (v2):
- build the URIRef from the URI string of the Wikidata binding
  (binding['entity']['value']) instead of from the binding dict itself
- escape entity names before embedding them in the Wikidata query, so a
  name containing '"' or '\' no longer produces an invalid query
- check the HTTP status of WikiMedica responses, as already done for the
  SPARQL endpoint
- document the helpers in English; the endpoint helper documented a
  ':param page:' that does not exist
- end the file with a newline
- blob ids on the 'index' lines are carried over unchanged from v1

 data/wikimedica/alignement.py | 157 ++++++++++++++++++++++++++++++++++
 1 file changed, 157 insertions(+)
 create mode 100644 data/wikimedica/alignement.py

diff --git a/data/wikimedica/alignement.py b/data/wikimedica/alignement.py
new file mode 100644
index 0000000..cb807df
--- /dev/null
+++ b/data/wikimedica/alignement.py
@@ -0,0 +1,157 @@
+import requests
+import rdflib
+
+SIGNES_ET_SYMPTOMES_WIKIMEDICA_PAGE = (
+    'http://wikimedi.ca/wiki/Sp%C3%A9cial:'
+    'Export_RDF/Concept:Signes_et_sympt%C3%B4mes'
+)
+
+PREFIX_PROPERTY = "http://wikimedi.ca/wiki/Special:URIResolver/Property-3A"
+
+WIKIDATA_SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
+
+USER_AGENT = "WikimedicaDiseaseSearch/0.1 (https://gitlab.com/matteodelabre/wikimedica-disease-search)"
+
+session = requests.Session()
+session.headers = {'User-Agent': USER_AGENT}
+
+def make_sparql_request_wikimedica(page, request):
+    """
+    Run a SPARQL query against a WikiMedica page
+
+    The page is downloaded, parsed into an rdflib graph and queried locally.
+
+    :param page: URL of the WikiMedica page to load
+    :param request: SPARQL query to run on the parsed graph
+    :return: list of result rows, each converted to a dict
+    :raises Exception: if the page is not served with HTTP status 200
+    """
+    data = session.get(page, stream=True)
+
+    # Fail early rather than handing an error page to the RDF parser
+    if data.status_code != 200:
+        raise Exception('Erreur {}'.format(data.status_code))
+
+    g = rdflib.Graph()
+    g.parse(data.raw)
+    qres = g.query(request)
+
+    return [row.asdict() for row in qres]
+
+
+def make_sparql_request_endpoint(endpoint, request):
+    """
+    Run a SPARQL query against a SPARQL endpoint
+
+    :param endpoint: URL of the SPARQL endpoint
+    :param request: SPARQL query to run
+    :return: decoded JSON body of the response
+    :raises Exception: if the endpoint does not answer with HTTP status 200
+    """
+    res = session.get(endpoint, params={'format': 'json', 'query': request})
+
+    if res.status_code != 200:
+        raise Exception('Erreur {}'.format(res.status_code))
+
+    return res.json()
+
+
+def escape_sparql_literal(text):
+    """
+    Escape text for use inside a double-quoted SPARQL string literal
+
+    :param text: raw text to embed in a query
+    :return: text with backslashes and double quotes backslash-escaped
+    """
+    return text.replace('\\', '\\\\').replace('"', '\\"')
+
+
+nb_wikimedica_entity = 0
+nb_align_by_property = 0
+nb_align_by_name = 0
+nb_align_conflict = 0
+
+# get name and URI from WikiMedica
+request_name_uri = """
+    SELECT ?name ?uri
+    WHERE {
+        ?el rdfs:label ?name .
+        ?el rdfs:isDefinedBy ?uri
+    }
+"""
+
+entitys_dic = make_sparql_request_wikimedica(
+    SIGNES_ET_SYMPTOMES_WIKIMEDICA_PAGE,
+    request_name_uri
+)
+
+nb_wikimedica_entity = len(entitys_dic)
+
+# Try align entity with Wikidata
+for entity in entitys_dic:
+    print()
+    print("="*10 + " " + entity['name'] + " " + "="*10)
+    print(entity['uri'])
+
+
+    # use Wikidata_id property
+    request_prop_Wikidata_id = """
+        PREFIX property: <{}>
+        SELECT ?wikidata_id
+        WHERE {{
+            ?el property:Wikidata_id ?wikidata_id .
+        }}
+    """.format(PREFIX_PROPERTY)
+
+    prop_Wikidata_id = make_sparql_request_wikimedica(
+        entity['uri'],
+        request_prop_Wikidata_id
+    )
+
+    if len(prop_Wikidata_id) == 1:
+        nb_align_by_property = nb_align_by_property + 1
+        entity.update(prop_Wikidata_id[0])
+        print("align with 'property:Wikidata_id'")
+    elif len(prop_Wikidata_id) > 1:
+        print("Attention : il y a plus de 1 lien 'property:wikidata_id'")
+
+
+    # use name of entity
+    else:
+        request_search_by_name = """
+            SELECT ?entity
+            WHERE {{
+                ?entity rdfs:label ?entityLabel;
+                        wdt:P31 ?type.
+                VALUES ?entityLabel {{"{}"@fr "{}"@fr}}
+                VALUES ?type {{wd:Q12136 wd:Q1441305 wd:Q169872}}
+            }}
+        """.format(
+            escape_sparql_literal(entity['name'].lower()),
+            escape_sparql_literal(entity['name'].capitalize())
+        )
+
+        list_wikidata_pages_uri = (make_sparql_request_endpoint(
+            WIKIDATA_SPARQL_ENDPOINT,
+            request_search_by_name
+        ))['results']['bindings']
+
+        if len(list_wikidata_pages_uri) == 1:
+            nb_align_by_name = nb_align_by_name + 1
+            wikidata_uri = list_wikidata_pages_uri[0]
+            # each binding is a dict; the URI string is under ['entity']['value']
+            entity['wikidata_id'] = rdflib.term.URIRef(
+                wikidata_uri['entity']['value']
+            )
+            print("align with 'name', 1 response")
+            print(wikidata_uri['entity']['value'])
+        elif len(list_wikidata_pages_uri) > 1:
+            nb_align_conflict = nb_align_conflict + 1
+            print("align with 'name', multi response")
+            print(list_wikidata_pages_uri)
+
+print()
+print("nb_wikimedica_entity : ", nb_wikimedica_entity)
+print("nb_align_by_property : ", nb_align_by_property)
+print("nb_align_by_name : ", nb_align_by_name)
+print("nb_align_conflict : ", nb_align_conflict)
From 4c094699ee6729e5397484ab8dff4335fa817565 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20C=C3=A9r=C3=A8s?=
Date: Tue, 26 Nov 2019 01:19:36 -0500
Subject: [PATCH 2/2] alignement: renomage des nom des fonctions

---
 data/wikimedica/alignement.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/data/wikimedica/alignement.py b/data/wikimedica/alignement.py
index cb807df..231acca 100644
--- a/data/wikimedica/alignement.py
+++ b/data/wikimedica/alignement.py
@@ -15,7 +15,7 @@ USER_AGENT = "WikimedicaDiseaseSearch/0.1 (https://gitlab.com/matteodelabre/wiki
 session = requests.Session()
 session.headers = {'User-Agent': USER_AGENT}
 
-def make_sparql_request_wikimedica(page, request):
+def request_wikimedica(page, request):
     """
     Run a SPARQL query against a WikiMedica page
 
@@ -39,7 +39,7 @@ def make_sparql_request_wikimedica(page, request):
     return [row.asdict() for row in qres]
 
 
-def make_sparql_request_endpoint(endpoint, request):
+def request_endpoint(endpoint, request):
     """
     Run a SPARQL query against a SPARQL endpoint
 
@@ -80,7 +80,7 @@ request_name_uri = """
     }
 """
 
-entitys_dic = make_sparql_request_wikimedica(
+entitys_dic = request_wikimedica(
     SIGNES_ET_SYMPTOMES_WIKIMEDICA_PAGE,
     request_name_uri
 )
@@ -103,7 +103,7 @@ for entity in entitys_dic:
         }}
     """.format(PREFIX_PROPERTY)
 
-    prop_Wikidata_id = make_sparql_request_wikimedica(
+    prop_Wikidata_id = request_wikimedica(
         entity['uri'],
         request_prop_Wikidata_id
     )
@@ -131,7 +131,7 @@ for entity in entitys_dic:
             escape_sparql_literal(entity['name'].capitalize())
         )
 
-        list_wikidata_pages_uri = (make_sparql_request_endpoint(
+        list_wikidata_pages_uri = (request_endpoint(
             WIKIDATA_SPARQL_ENDPOINT,
             request_search_by_name
         ))['results']['bindings']