2019-11-27 05:38:13 +00:00
|
|
|
#!/usr/bin/env python
|
2019-11-27 09:16:05 +00:00
|
|
|
import collections
import json
import os
import urllib.parse

from neo4j import GraphDatabase

from fetch import wikidata, wikimedica, mediawiki_api, wikipedia_pageviews
|
2019-11-27 05:38:13 +00:00
|
|
|
|
|
|
|
|
|
|
|
# Neo4j connection settings.
# Each value can be overridden through an environment variable of the same
# name so that credentials do not have to live in the source file; the
# defaults match the previous hard-coded values.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USR = os.environ.get("NEO4J_USR", "neo4j")
NEO4J_PSW = os.environ.get("NEO4J_PSW", "test")

# JSON file read by align_with_wikimedica()
ALIGNEMENT_FILE_PATH = "data/alignment_result.json"
|
|
|
|
|
2019-11-27 05:38:13 +00:00
|
|
|
|
|
|
|
|
|
|
|
def define_link_from_type(link_id):
    """
    Define type of link from ID

    The returned string is used by the caller both as the node label of the
    link target and, prefixed with ``link_``, as the relationship type.

    :param link_id: id of link (e.g. "P780")
    :return: type corresponding ("Sign_symsymptoms" or "Risk_factor")
    :raises ValueError: if the id is not one of the handled link ids
    """
    if link_id in ("P780", "P1542"):
        return "Sign_symsymptoms"
    if link_id == "P5642":
        return "Risk_factor"
    # ValueError is still an Exception, so existing handlers keep working.
    # Formatting (rather than "+") keeps the message usable even when the
    # offending id is not a string.
    raise ValueError("Error : unknow link id: {}".format(link_id))
|
|
|
|
|
|
|
|
|
|
|
|
|
2019-11-27 09:16:05 +00:00
|
|
|
def get_score_visitor(wikipedia_page_uri):
    """
    Compute a smoothed, project-relative page-view score for an article

    The per-date views of the canonical article and of all its redirects are
    added together, divided by the project-wide views of the same date, and
    the result is passed through ``wikipedia_pageviews.mean`` and then
    ``wikipedia_pageviews.smooth`` (with 10 as second argument).

    :param wikipedia_page_uri: URI of the article; the third "/"-separated
        component is used as the project and the last one (percent-decoded)
        as the article title
    :return: value returned by ``wikipedia_pageviews.smooth``
    """
    uri_parts = wikipedia_page_uri.split("/")
    project = uri_parts[2]
    requested_title = urllib.parse.unquote(uri_parts[-1])

    site = mediawiki_api.instanciate(project)
    project_views = wikipedia_pageviews.get_aggregate(project)

    # From here on only the canonical title is used
    article = mediawiki_api.article_canonical(site, requested_title)
    pages = mediawiki_api.article_redirects(site, article) + [article]

    # Counter addition merges the per-date counts of every page
    # NOTE(review): assumes get_article returns a collections.Counter keyed
    # by date - confirm against wikipedia_pageviews
    total_views = collections.Counter()
    for page in pages:
        total_views = total_views + wikipedia_pageviews.get_article(project, page)

    relative_views = {
        date: views / project_views[date]
        for date, views in total_views.items()
    }

    return wikipedia_pageviews.smooth(
        wikipedia_pageviews.mean(relative_views), 10
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
2019-11-27 05:52:21 +00:00
|
|
|
def create_graph():
    """
    Build and insert graph from wikidata to neo4j

    Queries Wikidata for every (disease, link, target) triple whose link is
    one of P780, P1542 or P5642, then for each result row merges the disease
    node, the target node and the relationship between them, using the
    module-level ``driver``.

    :raises ValueError: propagated from define_link_from_type() if a row
        carries a link id it does not handle
    """
    # Create indexes
    with driver.session() as session:
        session.run("CREATE INDEX ON :Disease(id);")
        session.run("CREATE INDEX ON :Sign_symsymptoms(id);")
        session.run("CREATE INDEX ON :Risk_factor(id);")

    # Get all diseases, links, symptoms
    # Prefixed names are case-sensitive in SPARQL: the property must be
    # written wdt:P1542 to match the "P1542" id handled by
    # define_link_from_type().
    request_disease_links = (wikidata.request("""
        SELECT ?maladie ?maladieLabel ?link ?linkLabel ?signe_symptome ?signe_symptomeLabel ?wikipediaArticle
        WHERE {
            ?maladie ?link ?signe_symptome.
            OPTIONAL {
                ?wikipediaArticle schema:about ?maladie;
                                  schema:isPartOf <https://fr.wikipedia.org/>.
            }
            SERVICE wikibase:label { bd:serviceParam wikibase:language "fr"}.
            VALUES ?link {wdt:P780 wdt:P1542 wdt:P5642}
        }
        """
    ))['results']['bindings']

    # A disease appears once per link, so its score is computed only once
    # and reused for the following rows.
    weights_by_uri = {}

    for link in request_disease_links:
        disease_id = link["maladie"]["value"].split("/")[-1]
        disease_label = link["maladieLabel"]["value"].lower()
        disease_type = "Disease"

        # ?wikipediaArticle is OPTIONAL in the query: the binding is absent
        # for diseases without a French Wikipedia article.
        wikipedia_uri = link.get("wikipediaArticle", {}).get("value")
        if wikipedia_uri is None:
            weights = []
        else:
            if wikipedia_uri not in weights_by_uri:
                weights_by_uri[wikipedia_uri] = list(
                    get_score_visitor(wikipedia_uri)
                )
            weights = weights_by_uri[wikipedia_uri]

        link_id = link["link"]["value"].split("/")[-1]
        link_label = link["linkLabel"]["value"].lower()
        link_type = "link_" + define_link_from_type(link_id)

        signe_symptome_id = link["signe_symptome"]["value"].split("/")[-1]
        # NOTE(review): splitting a label on "/" keeps only the text after
        # the last slash - confirm this is intended for labels containing "/"
        signe_symptome_label = link["signe_symptomeLabel"]["value"].split("/")[-1].lower()
        signe_symptome_type = define_link_from_type(link_id)

        with driver.session() as session:
            # add diseases
            # Merge on the id only and SET the remaining properties:
            # merging on the weights would create a new node whenever the
            # score changes between runs, and MERGE rejects a null property
            # (missing Wikipedia article).
            session.run(
                "MERGE (d:" + disease_type + " {id:$disease_id}) "
                "SET d.label = $disease_label, d.weights = $weights, "
                "d.wikipedia_uri = $wikipedia_uri",
                disease_id=disease_id,
                disease_label=disease_label,
                weights=weights,
                wikipedia_uri=wikipedia_uri
            )

            # add symptoms
            session.run(
                "MERGE (s:" + signe_symptome_type + " {id:$signe_symptome_id, label:$signe_symptome_label})",
                signe_symptome_id=signe_symptome_id,
                signe_symptome_label=signe_symptome_label,
            )

            # add link
            session.run(
                "MATCH (d:" + disease_type + " {id:$disease_id}) "
                "MATCH (s:" + signe_symptome_type + " {id:$signe_symptome_id}) "
                "MERGE (d)-[l:" + link_type + " {id:$link_id, label:$link_label}]->(s)",
                link_id=link_id,
                link_label=link_label,
                disease_id=disease_id,
                signe_symptome_id=signe_symptome_id
            )
|
2019-11-27 05:38:13 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
2019-11-27 06:27:31 +00:00
|
|
|
def align_with_wikimedica():
    """
    Align neo4j graph and Wikimedica

    Reads the alignment entries from ALIGNEMENT_FILE_PATH and, for every
    entry that has a 'wikidata_id', stores the result of
    ``wikimedica.get_web_page(entry['wikimedica_uri'])`` in the
    ``wikimedia_id`` property of the node(s) with that id.
    """
    # JSON is UTF-8 by specification; do not depend on the locale encoding
    with open(ALIGNEMENT_FILE_PATH, "r", encoding="utf-8") as align_file:
        align = json.load(align_file)

    for entity in align:
        if 'wikidata_id' not in entity:
            continue

        wikidata_id = entity['wikidata_id']
        # Fetched before opening the session so the database connection is
        # not held while waiting on the remote call.
        # NOTE(review): the property written below is spelled "wikimedia_id"
        # although the value comes from Wikimedica - kept as is because it is
        # part of the graph schema; confirm with the readers of the graph.
        wikimedica_page = wikimedica.get_web_page(entity['wikimedica_uri'])

        with driver.session() as session:
            session.run(
                "MATCH (d {id:$wikidata_id})"
                "SET d.wikimedia_id = $wikimedica_uri",
                wikidata_id=wikidata_id,
                wikimedica_uri=wikimedica_page,
            )
|
|
|
|
|
|
|
|
|
2019-11-27 08:24:01 +00:00
|
|
|
|
2019-11-27 05:52:21 +00:00
|
|
|
# Connection with Neo4j
# "driver" stays a module-level name: create_graph() and
# align_with_wikimedica() read it as a global.
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USR, NEO4J_PSW))
try:
    create_graph()
    align_with_wikimedica()
finally:
    # Close Neo4j connection, even when one of the steps above raised
    driver.close()
|