"""Align WikiMedica "signs and symptoms" entities with Wikidata items.

For every entity exported by the WikiMedica concept page, the script first
looks for an explicit ``Wikidata_id`` property on the entity's own RDF export.
When that property is absent it falls back to a label search against the
Wikidata SPARQL endpoint. Progress and summary counters are printed to stdout.
"""
import requests
import rdflib

SIGNES_ET_SYMPTOMES_WIKIMEDICA_PAGE = (
    'http://wikimedi.ca/wiki/Sp%C3%A9cial:'
    'Export_RDF/Concept:Signes_et_sympt%C3%B4mes'
)

PREFIX_PROPERTY = "http://wikimedi.ca/wiki/Special:URIResolver/Property-3A"

WIKIDATA_SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"

USER_AGENT = (
    "WikimedicaDiseaseSearch/0.1 "
    "(https://gitlab.com/matteodelabre/wikimedica-disease-search)"
)

session = requests.Session()
# NOTE(review): assigning (rather than updating) replaces the session's default
# headers, Accept-Encoding included. request_wikimedica() parses the undecoded
# raw stream, so this may be load-bearing -- confirm before switching to
# ``session.headers.update(...)``.
session.headers = {'User-Agent': USER_AGENT}


def _escape_sparql_literal(text):
    """Escape *text* so it can be embedded in a double-quoted SPARQL literal."""
    return (
        text.replace('\\', '\\\\')
        .replace('"', '\\"')
        .replace('\n', '\\n')
        .replace('\r', '\\r')
    )


def request_wikimedica(page, request):
    """
    Run a SPARQL query against the RDF export of a WikiMedica page.

    :param page: URL of the targeted WikiMedica RDF export
    :param request: SPARQL query to evaluate on the downloaded graph
    :return: list of dicts, one per result row, keyed by variable name
    """
    g = rdflib.Graph()

    # The response is streamed, so it must be closed explicitly to release
    # the underlying connection once the graph has been parsed.
    with session.get(page, stream=True) as data:
        g.parse(data.raw)

    qres = g.query(request)

    return [row.asdict() for row in qres]


def request_endpoint(endpoint, request):
    """
    Run a SPARQL query against a remote SPARQL endpoint.

    :param endpoint: URL of the SPARQL endpoint
    :param request: SPARQL query to send
    :return: decoded JSON body of the response
    :raises Exception: if the endpoint answers with a non-200 status code
    """
    res = session.get(endpoint, params={'format': 'json', 'query': request})

    if res.status_code != 200:
        raise Exception('Erreur {}'.format(res.status_code))

    return res.json()


nb_wikimedica_entity = 0
nb_align_by_property = 0
nb_align_by_name = 0
nb_align_conflict = 0

# Get the name and URI of every entity listed on the WikiMedica concept page.
request_name_uri = """
    SELECT ?name ?uri
    WHERE {
        ?el rdfs:label ?name .
        ?el rdfs:isDefinedBy ?uri
    }
"""

entitys_dic = request_wikimedica(
    SIGNES_ET_SYMPTOMES_WIKIMEDICA_PAGE,
    request_name_uri
)

nb_wikimedica_entity = len(entitys_dic)

# Query looking up the explicit Wikidata_id property of an entity. It does
# not depend on the entity, so it is built once, outside the loop.
request_prop_Wikidata_id = """
    PREFIX property: <{}>
    SELECT ?wikidata_id
    WHERE {{
        ?el property:Wikidata_id ?wikidata_id .
    }}
""".format(PREFIX_PROPERTY)

# Try to align each entity with Wikidata.
for entity in entitys_dic:
    print()
    print("="*10 + " " + entity['name'] + " " + "="*10)
    print(entity['uri'])

    # First strategy: use the Wikidata_id property of the entity's own export.
    prop_Wikidata_id = request_wikimedica(
        entity['uri'],
        request_prop_Wikidata_id
    )

    if len(prop_Wikidata_id) == 1:
        nb_align_by_property += 1
        entity.update(prop_Wikidata_id[0])
        print("align with 'property:Wikidata_id'")
    elif len(prop_Wikidata_id) > 1:
        print("Attention : il y a plus de 1 lien 'property:wikidata_id'")

    # Second strategy: search Wikidata by French label, trying both the
    # lower-cased and the capitalized form of the entity name.
    else:
        request_search_by_name = """
            SELECT ?entity
            WHERE {{
                ?entity rdfs:label ?entityLabel;
                        wdt:P31 ?type.
                VALUES ?entityLabel {{"{}"@fr "{}"@fr}}
                VALUES ?type {{wd:Q12136 wd:Q1441305 wd:Q169872}}
            }}
        """.format(
            # Escape the label so quotes or backslashes in a name cannot
            # break out of the SPARQL string literal.
            _escape_sparql_literal(entity['name'].lower()),
            _escape_sparql_literal(entity['name'].capitalize())
        )

        list_wikidata_pages_uri = request_endpoint(
            WIKIDATA_SPARQL_ENDPOINT,
            request_search_by_name
        )['results']['bindings']

        if len(list_wikidata_pages_uri) == 1:
            nb_align_by_name += 1
            wikidata_uri = list_wikidata_pages_uri[0]
            # Each binding is a dict; the URI string itself lives under
            # ['entity']['value'], which is what URIRef must receive.
            entity['wikidata_id'] = rdflib.term.URIRef(
                wikidata_uri['entity']['value']
            )
            print("align with 'name', 1 response")
            print(wikidata_uri['entity']['value'])
        elif len(list_wikidata_pages_uri) > 1:
            nb_align_conflict += 1
            print("align with 'name', multi response")
            print(list_wikidata_pages_uri)

print()
print("nb_wikimedica_entity : ", nb_wikimedica_entity)
print("nb_align_by_property : ", nb_align_by_property)
print("nb_align_by_name : ", nb_align_by_name)
print("nb_align_conflict : ", nb_align_conflict)