Intégration du module fetch

This commit is contained in:
Rémi Cérès 2019-11-26 11:19:21 -05:00
parent 4c094699ee
commit fec298b472
5 changed files with 59 additions and 49 deletions

View File

@ -1,3 +1,4 @@
from fetch import wikidata, wikimedica
import requests import requests
import rdflib import rdflib
@ -8,48 +9,16 @@ SIGNES_ET_SYMPTOMES_WIKIMEDICA_PAGE = (
PREFIX_PROPERTY = "http://wikimedi.ca/wiki/Special:URIResolver/Property-3A" PREFIX_PROPERTY = "http://wikimedi.ca/wiki/Special:URIResolver/Property-3A"
WIKIDATA_SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
USER_AGENT = "WikimedicaDiseaseSearch/0.1 (https://gitlab.com/matteodelabre/wikimedica-disease-search)" USER_AGENT = "WikimedicaDiseaseSearch/0.1 (https://gitlab.com/matteodelabre/wikimedica-disease-search)"
session = requests.Session() session = requests.Session()
session.headers = {'User-Agent': USER_AGENT} session.headers = {'User-Agent': USER_AGENT}
def request_wikimedica(page, request):
    """
    Run a SPARQL query against the RDF content of a WikiMedica page.

    The page is downloaded through the module-level HTTP session, parsed
    into an in-memory rdflib graph, and the query is evaluated on that graph.

    :param page: URL of the targeted WikiMedica page
    :param request: SPARQL query to evaluate
    :return: list of dictionaries, one per result row (``row.asdict()``)
    """
    g = rdflib.Graph()

    # With stream=True the connection stays checked out of the pool until the
    # body is fully consumed or the response is closed. The context manager
    # guarantees the release, including when parsing raises.
    with session.get(page, stream=True) as data:
        # The raw urllib3 stream is not content-decoded by default; enable
        # decoding so a gzip/deflate Content-Encoding is undone before rdflib
        # reads the bytes.
        data.raw.decode_content = True
        g.parse(data.raw)

    qres = g.query(request)
    return [row.asdict() for row in qres]
def request_endpoint(endpoint, request):
    """
    Run a SPARQL query against an HTTP endpoint.

    :param endpoint: URL of the SPARQL endpoint
    :param request: SPARQL query to send
    :return: the response body decoded from JSON
    :raises Exception: if the endpoint answers with a status other than 200
    """
    query_params = {'format': 'json', 'query': request}
    response = session.get(endpoint, params=query_params)

    # Success path first; anything other than 200 is reported as an error.
    if response.status_code == 200:
        return response.json()

    raise Exception('Erreur {}'.format(response.status_code))
nb_wikimedica_entity = 0 nb_wikimedica_entity = 0
nb_align_by_property = 0 nb_align_by_property = 0
nb_align_by_name = 0 nb_align_by_name = 0
nb_align_conflict = 0 nb_multi_align_possibility = 0
# get name and URI from WikiMedica # get name and URI from WikiMedica
request_name_uri = """ request_name_uri = """
@ -60,7 +29,7 @@ request_name_uri = """
} }
""" """
entitys_dic = request_wikimedica( entitys_dic = wikimedica.request(
SIGNES_ET_SYMPTOMES_WIKIMEDICA_PAGE, SIGNES_ET_SYMPTOMES_WIKIMEDICA_PAGE,
request_name_uri request_name_uri
) )
@ -69,10 +38,13 @@ nb_wikimedica_entity = len(entitys_dic)
# Try align entity with Wikidata # Try align entity with Wikidata
for entity in entitys_dic: for entity in entitys_dic:
print() # Pre-traitment on name
print("="*10 + " " + entity['name'] + " " + "="*10) entity['name'] = (entity['name'].split("(")[0]).strip()
print(entity['uri']) name = entity['name']
print()
print("="*10 + " " + name + " " + "="*10)
print(entity['uri'])
# use Wikidata_id property # use Wikidata_id property
request_prop_Wikidata_id = """ request_prop_Wikidata_id = """
@ -83,7 +55,7 @@ for entity in entitys_dic:
}} }}
""".format(PREFIX_PROPERTY) """.format(PREFIX_PROPERTY)
prop_Wikidata_id = request_wikimedica( prop_Wikidata_id = wikimedica.request(
entity['uri'], entity['uri'],
request_prop_Wikidata_id request_prop_Wikidata_id
) )
@ -95,7 +67,6 @@ for entity in entitys_dic:
elif len(prop_Wikidata_id) > 1: elif len(prop_Wikidata_id) > 1:
print("Attention : il y a plus de 1 lien 'property:wikidata_id'") print("Attention : il y a plus de 1 lien 'property:wikidata_id'")
# use name of entity # use name of entity
else: else:
request_search_by_name = """ request_search_by_name = """
@ -106,12 +77,11 @@ for entity in entitys_dic:
VALUES ?entityLabel {{"{}"@fr "{}"@fr}} VALUES ?entityLabel {{"{}"@fr "{}"@fr}}
VALUES ?type {{wd:Q12136 wd:Q1441305 wd:Q169872}} VALUES ?type {{wd:Q12136 wd:Q1441305 wd:Q169872}}
}} }}
""".format((entity['name']).lower(), (entity['name']).capitalize()) """.format(name.lower(), name.capitalize())
list_wikidata_pages_uri = (request_endpoint( list_wikidata_pages_uri = (
WIKIDATA_SPARQL_ENDPOINT, wikidata.request(request_search_by_name)
request_search_by_name )['results']['bindings']
))['results']['bindings']
if len(list_wikidata_pages_uri) == 1: if len(list_wikidata_pages_uri) == 1:
nb_align_by_name = nb_align_by_name + 1 nb_align_by_name = nb_align_by_name + 1
@ -120,12 +90,14 @@ for entity in entitys_dic:
print("align with 'name', 1 response") print("align with 'name', 1 response")
print(wikidata_uri['entity']['value']) print(wikidata_uri['entity']['value'])
elif len(list_wikidata_pages_uri) > 1: elif len(list_wikidata_pages_uri) > 1:
nb_align_conflict = nb_align_conflict + 1 nb_multi_align_possibility = nb_multi_align_possibility + 1
print("align with 'name', multi response") print("align with 'name', multi response")
print(list_wikidata_pages_uri) print(list_wikidata_pages_uri)
print() print()
print("nb_wikimedica_entity : ", nb_wikimedica_entity) print("nb wikimedica_entity : ", nb_wikimedica_entity)
print("nb_align_by_property : ", nb_align_by_property) print("nb align by property : ", nb_align_by_property)
print("nb_align_by_name : ", nb_align_by_name) print("nb align by name : ", nb_align_by_name)
print("nb_align_conflict : ", nb_align_conflict) print("nb multi align possibility : ", nb_multi_align_possibility)
print("nb no align : ", nb_wikimedica_entity -
nb_align_by_property - nb_align_by_name)

20
data/fetch/wikidata.py Normal file
View File

@ -0,0 +1,20 @@
from .http import session
WIKIDATA_SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
def request(request):
    """
    Run a SPARQL query against the Wikidata endpoint.

    :param request: SPARQL query to send
    :return: the endpoint's response decoded from JSON
    :raises Exception: if the endpoint answers with a status other than 200
    """
    query_params = {'format': 'json', 'query': request}
    response = session.get(WIKIDATA_SPARQL_ENDPOINT, params=query_params)

    # Success path first; anything other than 200 is reported as an error.
    if response.status_code == 200:
        return response.json()

    raise Exception('Erreur {}'.format(response.status_code))

18
data/fetch/wikimedica.py Normal file
View File

@ -0,0 +1,18 @@
from .http import session
import rdflib
def request(page, request):
    """
    Run a SPARQL query against the RDF content of a WikiMedica page.

    The page is downloaded through the shared HTTP session, parsed into an
    in-memory rdflib graph, and the query is evaluated on that graph.

    :param page: URL of the targeted WikiMedica page
    :param request: SPARQL query to evaluate
    :return: list of dictionaries, one per result row (``row.asdict()``)
    """
    g = rdflib.Graph()

    # With stream=True the connection stays checked out of the pool until the
    # body is fully consumed or the response is closed. The context manager
    # guarantees the release, including when parsing raises.
    with session.get(page, stream=True) as data:
        # The raw urllib3 stream is not content-decoded by default; enable
        # decoding so a gzip/deflate Content-Encoding is undone before rdflib
        # reads the bytes.
        data.raw.decode_content = True
        g.parse(data.raw)

    qres = g.query(request)
    return [row.asdict() for row in qres]