From a3d55bddafe75ac4d0087f8f0cb4cbd13e4b0e2e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20C=C3=A9r=C3=A8s?=
Date: Wed, 27 Nov 2019 04:16:05 -0500
Subject: [PATCH] =?UTF-8?q?Int=C3=A9gration=20non=20fonctionelle=20des=20s?=
 =?UTF-8?q?core=20de=20page?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Review notes (this section is not part of the commit message):

* This is a revised copy of the patch. The commit id in the first line and
  the blob ids in the "index" line describe the original patch, not this one.
* Line breaks, indentation and blank context lines were reconstructed from a
  whitespace-collapsed copy. If a hunk is rejected, retry with
  `git apply --ignore-whitespace` and adjust the context by hand.
* The author address and the object of `schema:isPartOf` were missing from
  the copy. The latter is ASSUMED to be the French Wikipedia IRI because the
  label service is asked for "fr" -- TODO confirm.
* `?wikipediaArticle` is bound inside OPTIONAL, so the binding can be absent:
  the lookup no longer raises KeyError and the score is skipped in that case.
* The disease node is merged on its id only and the other properties are
  SET; with `weights` in the MERGE pattern every changed score would create
  a new node, and a null property value is rejected by MERGE.
* `wdt:p1542` -> `wdt:P1542` (Wikidata property ids use an upper-case P).
* `sum()` gets its start value positionally (`start=` needs Python >= 3.8).

 data/build_graph.py | 68 +++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 63 insertions(+), 5 deletions(-)

diff --git a/data/build_graph.py b/data/build_graph.py
index 6359864..c1fecea 100755
--- a/data/build_graph.py
+++ b/data/build_graph.py
@@ -1,7 +1,9 @@
 #!/usr/bin/env python
-from fetch import wikidata, wikimedica
+from fetch import wikidata, wikimedica, mediawiki_api, wikipedia_pageviews
 from neo4j import GraphDatabase
 import json
+import collections
+import urllib.parse
 
 
 NEO4J_URI = "bolt://localhost:7687"
@@ -28,6 +30,45 @@ def define_link_from_type(link_id):
 
 
 
+def get_score_visitor(wikipedia_page_uri):
+    """
+    Compute a smoothed, relative page-view score for a Wikipedia article.
+
+    The views of the article and of the pages returned by
+    mediawiki_api.article_redirects are summed, each total is divided by
+    the project-wide value stored under the same key, and the result goes
+    through wikipedia_pageviews.mean and wikipedia_pageviews.smooth.
+
+    wikipedia_page_uri is split on "/": the third field is used as the
+    project (host name) and the last, URL-decoded, field as the title.
+    """
+    project = wikipedia_page_uri.split("/")[2]
+    article = urllib.parse.unquote(wikipedia_page_uri.split("/")[-1])
+
+    site = mediawiki_api.instanciate(project)
+    project_views = wikipedia_pageviews.get_aggregate(project)
+    # Work on the canonical title from here on.
+    article = mediawiki_api.article_canonical(site, article)
+
+    redirects = mediawiki_api.article_redirects(site, article)
+    # The start value is positional: sum() only accepts `start=` as a
+    # keyword from Python 3.8 on.
+    total_views = sum(
+        (wikipedia_pageviews.get_article(project, page)
+         for page in redirects + [article]),
+        collections.Counter()
+    )
+
+    relative_views = {
+        date: total_view / project_views[date]
+        for date, total_view in total_views.items()
+    }
+
+    mean_views = wikipedia_pageviews.mean(relative_views)
+    return wikipedia_pageviews.smooth(mean_views, 10)
+
+
+
 def create_graph():
     """
     Build and insert graph from wikidata to neo4j
@@ -40,13 +81,16 @@ def create_graph():
 
     # Get all diseases, links, symptoms
     request_disease_links = (wikidata.request("""
-        SELECT ?maladie ?maladieLabel ?link ?linkLabel ?signe_symptome ?signe_symptomeLabel
+        SELECT ?maladie ?maladieLabel ?link ?linkLabel ?signe_symptome ?signe_symptomeLabel ?wikipediaArticle
         WHERE {
             ?maladie ?link ?signe_symptome.
+            OPTIONAL {
+                ?wikipediaArticle schema:about ?maladie;
+                                  schema:isPartOf <https://fr.wikipedia.org/>.
+            }
             SERVICE wikibase:label { bd:serviceParam wikibase:language "fr"}.
-            VALUES ?link {wdt:P780 wdt:p1542 wdt:P5642}
+            VALUES ?link {wdt:P780 wdt:P1542 wdt:P5642}
         }
-        ORDER BY ?maladie
     """
     ))['results']['bindings']
 
@@ -54,6 +98,12 @@ def create_graph():
         disease_id = link["maladie"]["value"].split("/")[-1]
         disease_label = link["maladieLabel"]["value"].lower()
         disease_type = "Disease"
+        # ?wikipediaArticle is bound inside an OPTIONAL clause, so the key
+        # can be absent from a result row.
+        wikipedia_uri = link.get("wikipediaArticle", {}).get("value")
+        weights = []
+        if wikipedia_uri:
+            weights = list(get_score_visitor(wikipedia_uri))
 
         link_id = link["link"]["value"].split("/")[-1]
         link_label = link["linkLabel"]["value"].lower()
@@ -63,12 +113,20 @@ def create_graph():
         signe_symptome_label = link["signe_symptomeLabel"]["value"].split("/")[-1].lower()
         signe_symptome_type = define_link_from_type(link_id)
 
+
         with driver.session() as session:
             # add dieadiseases
             session.run(
-                "MERGE (d:" + disease_type + " {id:$disease_id, label:$disease_label})",
+                # Match on the id alone and SET the rest, so a changed score
+                # updates the node instead of creating a second one, and a
+                # missing (null) URI does not make the MERGE fail.
+                "MERGE (d:" + disease_type + " {id:$disease_id}) "
+                "SET d.label = $disease_label, d.weights = $weights, "
+                "d.wikipedia_uri = $wikipedia_uri",
                 disease_id=disease_id,
                 disease_label=disease_label,
+                weights=weights,
+                wikipedia_uri=wikipedia_uri
             )
 
             # add symptoms