wikimedica-disease-search/data/alignement.py

115 lines
3.3 KiB
Python

from fetch import wikidata, wikimedica
import requests
import rdflib
import json
# RDF export URL of the Wikimedica concept page; the percent-encoded path
# decodes to "Spécial:Export_RDF/Concept:Signes_et_symptômes".
SIGNES_ET_SYMPTOMES_WIKIMEDICA_PAGE = (
    'http://wikimedi.ca/wiki/Sp%C3%A9cial:'
    'Export_RDF/Concept:Signes_et_sympt%C3%B4mes'
)
# Namespace bound to the "property:" prefix in the per-entity SPARQL query.
PREFIX_PROPERTY = "http://wikimedi.ca/wiki/Special:URIResolver/Property-3A"
USER_AGENT = "WikimedicaDiseaseSearch/0.1 (https://gitlab.com/matteodelabre/wikimedica-disease-search)"

# HTTP session identifying this tool through its User-Agent.
# update() is used instead of assigning a new dict so that the session keeps
# its other default headers and its case-insensitive header mapping.
# NOTE(review): the session is not referenced again in the visible part of
# this file — confirm whether the fetch helpers are expected to use it.
session = requests.Session()
session.headers.update({'User-Agent': USER_AGENT})

# Run statistics, printed once all entities have been processed.
nb_wikimedica_entity = 0
nb_align_by_property = 0
nb_align_by_name = 0
nb_multi_align_possibility = 0

# Fetch the label and the defining URI of every element described by the
# Wikimedica export.
request_name_uri = """
SELECT ?name ?wikimedica_uri
WHERE {
?el rdfs:label ?name .
?el rdfs:isDefinedBy ?wikimedica_uri
}
"""
# Each result is used below as a dict with 'name' and 'wikimedica_uri' keys.
entitys_dic = wikimedica.request(
    SIGNES_ET_SYMPTOMES_WIKIMEDICA_PAGE,
    request_name_uri,
)
nb_wikimedica_entity = len(entitys_dic)
# Align every Wikimedica entity with a Wikidata item: first through the
# entity's own 'property:Wikidata_id' value and, when there is none, through
# a French label search on Wikidata.


def _sparql_string(text):
    """Return text escaped for use inside a double-quoted SPARQL literal."""
    # Backslashes must be doubled first so the escapes added afterwards are
    # not themselves re-escaped.
    return (
        text.replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )


# The property query does not depend on the entity, so it is built once.
request_prop_Wikidata_id = """
PREFIX property: <{}>
SELECT ?wikidata_id
WHERE {{
?el property:Wikidata_id ?wikidata_id .
}}
""".format(PREFIX_PROPERTY)

# Label search template; the two placeholders receive the lower-cased and
# the capitalized label, and candidates are limited to the classes listed
# in "VALUES ?type".
request_search_by_name_template = """
SELECT ?entity
WHERE {{
?entity rdfs:label ?entityLabel;
wdt:P31 ?type.
VALUES ?entityLabel {{"{}"@fr "{}"@fr}}
VALUES ?type {{wd:Q12136 wd:Q1441305 wd:Q169872}}
}}
"""

for entity in entitys_dic:
    # Keep only the part of the label that precedes the first "(".
    entity['name'] = entity['name'].split("(")[0].strip()
    name = entity['name']
    wikimedica_uri = entity['wikimedica_uri']

    print()
    print("="*10 + " " + name + " " + "="*10)
    print(wikimedica_uri)

    # First attempt: the Wikidata id stored on the Wikimedica page itself.
    prop_Wikidata_id = wikimedica.request(
        wikimedica_uri,
        request_prop_Wikidata_id,
    )

    if len(prop_Wikidata_id) == 1:
        nb_align_by_property += 1
        # Merge the single result row into the entity
        # (presumably {'wikidata_id': ...} — verify against wikimedica.request).
        entity.update(prop_Wikidata_id[0])
        print("align with 'property:Wikidata_id'")
    elif len(prop_Wikidata_id) > 1:
        # Several ids on one page: report it and leave the entity unaligned.
        print("Attention : il y a plus de 1 lien 'property:wikidata_id'")
    else:
        # Second attempt: search Wikidata by label. The label comes from
        # external data, so it is escaped before being placed in the query.
        request_search_by_name = request_search_by_name_template.format(
            _sparql_string(name.lower()),
            _sparql_string(name.capitalize()),
        )
        list_wikidata_pages_uri = (
            wikidata.request(request_search_by_name)
        )['results']['bindings']

        if len(list_wikidata_pages_uri) == 1:
            nb_align_by_name += 1
            # The id is the last path segment of the entity URI.
            entity['wikidata_id'] = (
                list_wikidata_pages_uri[0]['entity']['value']
            ).split("/")[-1]
            print("align with 'name', 1 response")
            print(entity['wikidata_id'])
        elif len(list_wikidata_pages_uri) > 1:
            # Ambiguous label: count it, show the candidates, do not align.
            nb_multi_align_possibility += 1
            print("align with 'name', multi response")
            print(list_wikidata_pages_uri)
# Summary of the run. "nb no align" is the total minus the two aligned
# counters, so entities with several candidate matches are included in it.
summary_rows = (
    ("nb wikimedica_entity : ", nb_wikimedica_entity),
    ("nb align by property : ", nb_align_by_property),
    ("nb align by name : ", nb_align_by_name),
    ("nb multi align possibility : ", nb_multi_align_possibility),
    (
        "nb no align : ",
        nb_wikimedica_entity - nb_align_by_property - nb_align_by_name,
    ),
)
print()
for summary_label, summary_value in summary_rows:
    print(summary_label, summary_value)
# Write every entity (aligned or not) to the result file as indented JSON
# with sorted keys. The path is relative to the current working directory.
with open("./data/alignment_result.json", "w") as output_file:
    output_file.write(
        json.dumps(entitys_dic, indent=4, sort_keys=True)
    )