"""Align WikiMedica sign-and-symptom entities with Wikidata identifiers.

For every entity returned for the WikiMedica concept page, the script first
looks for an explicit ``property:Wikidata_id`` value on the entity's own
page. When none is present it falls back to a French-label search on
Wikidata. Progress and summary counts are printed, and the (possibly
enriched) entity list is written to ``./data/alignment_result.json``.
"""

import json

import rdflib
import requests

from fetch import wikidata, wikimedica

SIGNES_ET_SYMPTOMES_WIKIMEDICA_PAGE = (
    'http://wikimedi.ca/wiki/Sp%C3%A9cial:'
    'Export_RDF/Concept:Signes_et_sympt%C3%B4mes'
)
PREFIX_PROPERTY = "http://wikimedi.ca/wiki/Special:URIResolver/Property-3A"
USER_AGENT = (
    "WikimedicaDiseaseSearch/0.1 "
    "(https://gitlab.com/matteodelabre/wikimedica-disease-search)"
)

# NOTE(review): this session is configured but never passed to the fetch
# helpers in the code visible here -- confirm whether it is still needed.
session = requests.Session()
session.headers = {'User-Agent': USER_AGENT}

nb_wikimedica_entity = 0
nb_align_by_property = 0
nb_align_by_name = 0
nb_multi_align_possibility = 0


def _sparql_string(text):
    """Return *text* escaped for use inside a double-quoted SPARQL literal.

    Labels come from wiki pages, so a stray quote, backslash or line break
    would otherwise terminate the literal early and corrupt the query.
    """
    return (
        text.replace("\\", "\\\\")  # backslash first, so later escapes survive
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )


# Get name and URI from WikiMedica.
request_name_uri = """
    SELECT ?name ?wikimedica_uri
    WHERE {
        ?el rdfs:label ?name .
        ?el rdfs:isDefinedBy ?wikimedica_uri
    }
"""

# The code below indexes each result by 'name' and 'wikimedica_uri' and
# calls .update() on it, so the result is used as a list of dicts.
entitys_dic = wikimedica.request(
    SIGNES_ET_SYMPTOMES_WIKIMEDICA_PAGE,
    request_name_uri
)

nb_wikimedica_entity = len(entitys_dic)

# Query for the explicit Wikidata identifier stored on an entity page.
# It does not depend on the entity, so it is built once, outside the loop.
request_prop_Wikidata_id = """
    PREFIX property: <{}>
    SELECT ?wikidata_id
    WHERE {{
        ?el property:Wikidata_id ?wikidata_id .
    }}
""".format(PREFIX_PROPERTY)

# Try to align each entity with Wikidata.
for entity in entitys_dic:
    # Pre-processing of the name: keep only the text before the first "("
    # and store the cleaned name back on the entity, so it is exported too.
    name = entity['name'].split("(")[0].strip()
    entity['name'] = name
    wikimedica_uri = entity['wikimedica_uri']

    print()
    print("="*10 + " " + name + " " + "="*10)
    print(wikimedica_uri)

    # First strategy: use the Wikidata_id property of the entity page.
    prop_Wikidata_id = wikimedica.request(
        wikimedica_uri,
        request_prop_Wikidata_id
    )

    if len(prop_Wikidata_id) == 1:
        nb_align_by_property += 1
        entity.update(prop_Wikidata_id[0])
        print("align with 'property:Wikidata_id'")

    elif len(prop_Wikidata_id) > 1:
        # Ambiguous explicit links are reported and left unaligned; no name
        # search is attempted for them.
        print("Attention : il y a plus de 1 lien 'property:wikidata_id'")

    else:
        # Second strategy: search Wikidata by French label, trying both the
        # lower-cased and the capitalized form of the name.
        # NOTE(review): the three wd: classes presumably stand for disease,
        # medical sign and symptom -- confirm against Wikidata.
        request_search_by_name = """
            SELECT ?entity WHERE {{
                ?entity rdfs:label ?entityLabel;
                        wdt:P31 ?type.
                VALUES ?entityLabel {{"{}"@fr "{}"@fr}}
                VALUES ?type {{wd:Q12136 wd:Q1441305 wd:Q169872}}
            }}
        """.format(
            _sparql_string(name.lower()),
            _sparql_string(name.capitalize()),
        )

        list_wikidata_pages_uri = (
            wikidata.request(request_search_by_name)
        )['results']['bindings']

        if len(list_wikidata_pages_uri) == 1:
            nb_align_by_name += 1
            # The identifier is the last path segment of the entity URI.
            entity['wikidata_id'] = (
                list_wikidata_pages_uri[0]['entity']['value']
            ).split("/")[-1]
            print("align with 'name', 1 response")
            print(entity['wikidata_id'])

        elif len(list_wikidata_pages_uri) > 1:
            # Several candidates: report them without choosing one.
            nb_multi_align_possibility += 1
            print("align with 'name', multi response")
            print(list_wikidata_pages_uri)

print()
print("nb wikimedica_entity : ", nb_wikimedica_entity)
print("nb align by property : ", nb_align_by_property)
print("nb align by name : ", nb_align_by_name)
print("nb multi align possibility : ", nb_multi_align_possibility)
# "no align" covers every entity that did not get exactly one match,
# including the ambiguous (multi-match) ones counted above.
print("nb no align : ",
      nb_wikimedica_entity - nb_align_by_property - nb_align_by_name)

# Export the result to a file. json.dump escapes non-ASCII characters by
# default, so the explicit encoding only pins down an already-ASCII output.
with open("./data/alignment_result.json", "w", encoding="utf-8") as result_file:
    json.dump(entitys_dic, result_file, sort_keys=True, indent=4)