diff --git a/data/build_graph.py b/data/build_graph.py index 470e3e3..6359864 100755 --- a/data/build_graph.py +++ b/data/build_graph.py @@ -1,14 +1,14 @@ #!/usr/bin/env python -from fetch import wikidata +from fetch import wikidata, wikimedica from neo4j import GraphDatabase +import json NEO4J_URI = "bolt://localhost:7687" NEO4J_USR = "neo4j" NEO4J_PSW = "test" -# Conection with Neo4j -driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USR, NEO4J_PSW)) +ALIGNEMENT_FILE_PATH = "data/alignment_result.json" @@ -28,62 +28,97 @@ def define_link_from_type(link_id): -# Create indexes -with driver.session() as session: - session.run("CREATE INDEX ON :Disease(id);") - session.run("CREATE INDEX ON :Sign_symsymptoms(id);") - session.run("CREATE INDEX ON :Risk_factor(id);") - -# Get all diseases, links, symptoms -request_disease_links = (wikidata.request(""" - SELECT ?maladie ?maladieLabel ?link ?linkLabel ?signe_symptome ?signe_symptomeLabel - WHERE { - ?maladie ?link ?signe_symptome. - SERVICE wikibase:label { bd:serviceParam wikibase:language "fr"}. - VALUES ?link {wdt:P780 wdt:p1542 wdt:P5642} - } - ORDER BY ?maladie -""" -))['results']['bindings'] - -for link in request_disease_links: - disease_id = link["maladie"]["value"].split("/")[-1] - disease_label = link["maladieLabel"]["value"].lower() - disease_type = "Disease" - - link_id = link["link"]["value"].split("/")[-1] - link_label = link["linkLabel"]["value"].lower() - link_type = "link_"+define_link_from_type(link_id) - - signe_symptome_id = link["signe_symptome"]["value"].split("/")[-1] - signe_symptome_label = link["signe_symptomeLabel"]["value"].split("/")[-1].lower() - signe_symptome_type = define_link_from_type(link_id) - +def create_graph(): + """ + Build and insert graph from wikidata to neo4j + """ + # Create indexes with driver.session() as session: - # add dieadiseases - session.run( - "MERGE (d:" + disease_type + " {id:$disease_id, label:$disease_label})", - disease_id=disease_id, - disease_label=disease_label, - ) + session.run("CREATE INDEX ON :Disease(id);") + session.run("CREATE INDEX ON :Sign_symsymptoms(id);") + session.run("CREATE INDEX ON :Risk_factor(id);") - # add symptoms - session.run( - "MERGE (s:" + signe_symptome_type + " {id:$signe_symptome_id, label:$signe_symptome_label})", - signe_symptome_id=signe_symptome_id, - signe_symptome_label=signe_symptome_label, - ) + # Get all diseases, links, symptoms + request_disease_links = (wikidata.request(""" + SELECT ?maladie ?maladieLabel ?link ?linkLabel ?signe_symptome ?signe_symptomeLabel + WHERE { + ?maladie ?link ?signe_symptome. + SERVICE wikibase:label { bd:serviceParam wikibase:language "fr"}. + VALUES ?link {wdt:P780 wdt:p1542 wdt:P5642} + } + ORDER BY ?maladie + """ + ))['results']['bindings'] - # add link - session.run( - "MATCH (d:" + disease_type + " {id:$disease_id})" - "MATCH (s:" + signe_symptome_type + " {id:$signe_symptome_id})" - "MERGE (d)-[l:" + link_type + " {id:$link_id, label:$link_label}]->(s)", - link_id=link_id, - link_label=link_label, - disease_id=disease_id, - signe_symptome_id=signe_symptome_id - ) + for link in request_disease_links: + disease_id = link["maladie"]["value"].split("/")[-1] + disease_label = link["maladieLabel"]["value"].lower() + disease_type = "Disease" + + link_id = link["link"]["value"].split("/")[-1] + link_label = link["linkLabel"]["value"].lower() + link_type = "link_"+define_link_from_type(link_id) + + signe_symptome_id = link["signe_symptome"]["value"].split("/")[-1] + signe_symptome_label = link["signe_symptomeLabel"]["value"].split("/")[-1].lower() + signe_symptome_type = define_link_from_type(link_id) + + with driver.session() as session: + # add dieadiseases + session.run( + "MERGE (d:" + disease_type + " {id:$disease_id, label:$disease_label})", + disease_id=disease_id, + disease_label=disease_label, + ) + + # add symptoms + session.run( + "MERGE (s:" + signe_symptome_type + " {id:$signe_symptome_id, label:$signe_symptome_label})", + signe_symptome_id=signe_symptome_id, + signe_symptome_label=signe_symptome_label, + ) + + # add link + session.run( + "MATCH (d:" + disease_type + " {id:$disease_id})" + "MATCH (s:" + signe_symptome_type + " {id:$signe_symptome_id})" + "MERGE (d)-[l:" + link_type + " {id:$link_id, label:$link_label}]->(s)", + link_id=link_id, + link_label=link_label, + disease_id=disease_id, + signe_symptome_id=signe_symptome_id + ) + + + +def align_with_wikimedica(): + """ + Align neo4j graph and Wikidata + """ + with open(ALIGNEMENT_FILE_PATH, "r") as align_file: + align = json.loads(align_file.read()) + + for entity in align: + if 'wikidata_id' in entity: + + with driver.session() as session: + wikidata_id = entity['wikidata_id'] + wikidata_page = wikimedica.get_web_page(entity['wikimedica_uri']) + + session.run( + "MATCH (d {id:$wikidata_id})" + "SET d.wikimedia_id = $wikimedica_uri", + wikidata_id=wikidata_id, + wikimedica_uri=wikidata_page, + ) + + + +# Conection with Neo4j +driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USR, NEO4J_PSW)) + +create_graph() +align_with_wikimedica() # Close Neo4j connection driver.close() diff --git a/data/fetch/wikimedica.py b/data/fetch/wikimedica.py index 65d991f..0fdb1fa 100644 --- a/data/fetch/wikimedica.py +++ b/data/fetch/wikimedica.py @@ -1,18 +1,20 @@ from .http import session -import rdflib -def request(page, request): +def request(uri, request): """ - Effectue une requête SPARQL sur une page de WikiMedica + Effectue une requête SPARQL depuis une uri de WikiMedica - :param page: Page de WikiMedica ciblé + :param uri: uri de WikiMedica ciblé :param request: Requête SPARQL appliqué :return: Réponse du point d'accés sous forme d'un tableau de dictionaire """ - data = session.get(page, stream=True) + data = session.get(uri, stream=True) g = rdflib.Graph() g.parse(data.raw) qres = g.query(request) return [row.asdict() for row in qres] + +def get_web_page(uri): + return uri.replace("/Special:ExportRDF", "")