"""Align WikiMedica "Signes et symptômes" entities with Wikidata entities.

The script lists the entities (label + defining URI) found in the RDF export
of the WikiMedica concept page, then tries to attach a Wikidata identifier to
each of them:

1. through the ``Wikidata_id`` property found in the entity's own RDF page;
2. otherwise through a label search (French labels) on the Wikidata SPARQL
   endpoint.

Counters summarising the outcome are printed at the end.
"""
import requests
import rdflib

SIGNES_ET_SYMPTOMES_WIKIMEDICA_PAGE = (
    'http://wikimedi.ca/wiki/Sp%C3%A9cial:'
    'Export_RDF/Concept:Signes_et_sympt%C3%B4mes'
)
PREFIX_PROPERTY = "http://wikimedi.ca/wiki/Special:URIResolver/Property-3A"
WIKIDATA_SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
USER_AGENT = (
    "WikimedicaDiseaseSearch/0.1 "
    "(https://gitlab.com/matteodelabre/wikimedica-disease-search)"
)

session = requests.Session()
# This assignment replaces (rather than extends) the session's default
# headers, so User-Agent is the only header set explicitly by this script.
session.headers = {'User-Agent': USER_AGENT}


def make_sparql_request_wikimedica(page, request, rdf_format='xml'):
    """Run a SPARQL query against the RDF document served by a WikiMedica page.

    :param page: URL of the targeted WikiMedica RDF page
    :param request: SPARQL query to evaluate on the downloaded graph
    :param rdf_format: rdflib parser name used to read the document; given
        explicitly because the fallback used when no format is supplied
        differs between rdflib releases
    :return: list of dicts, one per result row, mapping variable names to
        rdflib terms
    :raises Exception: if the page does not answer with HTTP status 200
    """
    data = session.get(page, stream=True)
    try:
        if data.status_code != 200:
            raise Exception('Erreur {}'.format(data.status_code))

        # The raw stream bypasses requests' content decoding; ask urllib3 to
        # undo any Content-Encoding so the parser always sees plain RDF.
        data.raw.decode_content = True

        g = rdflib.Graph()
        g.parse(data.raw, format=rdf_format)
    finally:
        # A streamed response keeps its connection until explicitly closed.
        data.close()

    qres = g.query(request)
    return [row.asdict() for row in qres]


def make_sparql_request_endpoint(endpoint, request):
    """Run a SPARQL query on a remote SPARQL endpoint.

    :param endpoint: URL of the SPARQL endpoint
    :param request: SPARQL query to send
    :return: decoded JSON body of the response
    :raises Exception: if the endpoint does not answer with HTTP status 200
    """
    res = session.get(endpoint, params={'format': 'json', 'query': request})

    if res.status_code != 200:
        raise Exception('Erreur {}'.format(res.status_code))

    return res.json()


def _escape_sparql_string(text):
    """Escape *text* for use inside a double-quoted SPARQL string literal."""
    return (
        text.replace('\\', '\\\\')
        .replace('"', '\\"')
        .replace('\n', '\\n')
        .replace('\r', '\\r')
    )


nb_wikimedica_entity = 0
nb_align_by_property = 0
nb_align_by_name = 0
nb_align_conflict = 0

# Get the name and URI of every entity from WikiMedica
request_name_uri = """
    SELECT ?name ?uri
    WHERE {
        ?el rdfs:label ?name .
        ?el rdfs:isDefinedBy ?uri
    }
"""

entitys_dic = make_sparql_request_wikimedica(
    SIGNES_ET_SYMPTOMES_WIKIMEDICA_PAGE,
    request_name_uri
)
nb_wikimedica_entity = len(entitys_dic)

# Try to align each entity with Wikidata
for entity in entitys_dic:
    print()
    print("="*10 + " " + entity['name'] + " " + "="*10)
    print(entity['uri'])

    # First attempt: the Wikidata_id property stored on the entity's page
    request_prop_Wikidata_id = """
        PREFIX property: <{}>
        SELECT ?wikidata_id
        WHERE {{
            ?el property:Wikidata_id ?wikidata_id .
        }}
    """.format(PREFIX_PROPERTY)

    prop_Wikidata_id = make_sparql_request_wikimedica(
        entity['uri'],
        request_prop_Wikidata_id
    )

    if len(prop_Wikidata_id) == 1:
        nb_align_by_property += 1
        entity.update(prop_Wikidata_id[0])
        print("align with 'property:Wikidata_id'")

    elif len(prop_Wikidata_id) > 1:
        print("Attention : il y a plus de 1 lien 'property:wikidata_id'")

    # Second attempt: search Wikidata by the entity's name
    else:
        # DISTINCT keeps a single row per entity: without it the same entity
        # is returned several times when both label variants are identical or
        # when it has more than one of the listed types, which would be
        # miscounted as a conflict.
        request_search_by_name = """
            SELECT DISTINCT ?entity
            WHERE {{
                ?entity rdfs:label ?entityLabel;
                        wdt:P31 ?type.
                VALUES ?entityLabel {{"{}"@fr "{}"@fr}}
                VALUES ?type {{wd:Q12136 wd:Q1441305 wd:Q169872}}
            }}
        """.format(
            _escape_sparql_string(entity['name'].lower()),
            _escape_sparql_string(entity['name'].capitalize())
        )

        list_wikidata_pages_uri = make_sparql_request_endpoint(
            WIKIDATA_SPARQL_ENDPOINT,
            request_search_by_name
        )['results']['bindings']

        if len(list_wikidata_pages_uri) == 1:
            nb_align_by_name += 1
            # Each binding is a dict keyed by variable name; the URI itself
            # is under ['entity']['value'], not the binding as a whole.
            wikidata_uri = list_wikidata_pages_uri[0]['entity']['value']
            entity['wikidata_id'] = rdflib.term.URIRef(wikidata_uri)
            print("align with 'name', 1 response")
            print(wikidata_uri)

        elif len(list_wikidata_pages_uri) > 1:
            nb_align_conflict += 1
            print("align with 'name', multi response")
            print(list_wikidata_pages_uri)

print()
print("nb_wikimedica_entity : ", nb_wikimedica_entity)
print("nb_align_by_property : ", nb_align_by_property)
print("nb_align_by_name : ", nb_align_by_name)
print("nb_align_conflict : ", nb_align_conflict)