openalex: Create endpoint
parent 23c2b4b664
commit 2f8f7f1804
@@ -3,13 +3,14 @@ import logging
 import sys
 from os import environ
 from flask import Flask
-from . import twitch, youtube
+from . import twitch, youtube, openalex


 logger = logging.getLogger(__name__)
 blueprints = {
     "twitch": twitch,
     "youtube": youtube,
+    "openalex": openalex,
 }

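The hunk above only extends the blueprints mapping; the factory that consumes it is outside this diff. As a rough sketch, registration presumably looks something like the following, where the create_app name, the config layout, and the URL prefixes are assumptions for illustration:

from flask import Flask


def create_app(config: dict) -> Flask:
    """Hypothetical app factory consuming the blueprints mapping."""
    app = Flask(__name__)

    for name, module in blueprints.items():
        # Each endpoint module exposes create_blueprint(), as the new
        # openalex package does below.
        blueprint = module.create_blueprint(config.get(name, {}))
        app.register_blueprint(blueprint, url_prefix=f"/{name}")

    return app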
@@ -0,0 +1,18 @@
+from flask import abort, Blueprint
+from .openalex import APIClient
+from .feed import construct_rss
+
+
+def create_blueprint(config):
+    """Create an OpenAlex endpoint blueprint."""
+    client = APIClient(config["email"])
+    openalex = Blueprint("openalex", __name__)
+
+    @openalex.route("/<string:name>", methods=["GET", "HEAD"])
+    def get(name: str):
+        return (
+            construct_rss(client, name),
+            {"Content-Type": "application/rss+xml"},
+        )
+
+    return openalex
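To try the new route without deploying, Flask's built-in test client works; the /openalex prefix and contact address below are illustrative assumptions, and note that the request triggers live calls to api.openalex.org:

from flask import Flask

app = Flask(__name__)
app.register_blueprint(
    create_blueprint({"email": "admin@example.org"}),
    url_prefix="/openalex",
)

with app.test_client() as client:
    # Performs real OpenAlex requests through construct_rss().
    response = client.get("/openalex/Jane%20Doe")
    assert response.headers["Content-Type"] == "application/rss+xml"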
@@ -0,0 +1,41 @@
+from ..feedformatter import Feed
+from .openalex import APIClient
+
+
+def construct_rss(client: APIClient, name: str) -> str:
+    """
+    Build an RSS stream for an academic author.
+
+    :param client: OpenAlex API client
+    :param name: author display name
+    :returns: RSS stream
+    :raises HTTPException: if one of the requests fails
+    """
+    author_list = client.author(name)
+    works = client.works(author_list)
+
+    feed = Feed()
+
+    # Set the feed/author level properties
+    feed.feed["title"] = name
+    feed.feed["link"] = "https://example.org"
+    feed.feed["description"] = f"Latest works of {name} from the OpenAlex dataset."
+    feed.feed["author"] = "Feedleware"
+    feed.feed["ttl"] = "30"
+
+    for work in works:
+        feed.items.append({
+            "guid": work["id"],
+            "title": work["title"],
+            "link": work["url"],
+            "description": (
+                "; ".join(work["authors"])
+                + "<br>"
+                + work["source"]
+                + "<br><br>"
+                + work["description"]
+            ),
+            "pubDate": work["date"].timetuple(),
+        })
+
+    return feed.format_rss2_string()
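construct_rss reads a fixed set of keys from each work dictionary, so its contract with APIClient.works (defined in the next file) looks like this; all values here are illustrative placeholders:

import datetime

example_work = {
    "id": "https://openalex.org/W0000000000",  # becomes the item guid
    "title": "An Example Work",
    "url": "https://doi.org/10.0000/example",  # the work's DOI link
    "authors": ["Jane Doe", "John Doe"],
    "source": "Journal of Examples",
    "description": "No abstract",
    "date": datetime.date(2023, 1, 1),  # .timetuple() fills pubDate
}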
@@ -0,0 +1,156 @@
+import gzip
+import http
+import json
+import logging
+import urllib
+import urllib.request
+from typing import Any, Iterable, Tuple
+from cachetools import cached, TTLCache
+from ..util import send_with_retry, parse_iso_date
+
+
+HTTPError = urllib.error.HTTPError
+HTTPRequest = urllib.request.Request
+HTTPResponse = http.client.HTTPResponse
+HTTPException = http.client.HTTPException
+logger = logging.getLogger(__name__)
+
+
+class APIClient:
+    """Client for the OpenAlex API."""
+
+    def __init__(self, email: str = "", retries: int = 3):
+        """
+        Create an OpenAlex API client.
+
+        See <https://docs.openalex.org> for details.
+
+        :param email: contact email
+        :param retries: number of times to retry each request in case of failure
+        """
+        self.email: str = email
+        self.retries: int = retries
+
+    def _query(
+        self,
+        url: str,
+        method: str = "GET",
+        data: Iterable[Tuple[str, str]] = ()
+    ) -> Any:
+        """
+        Low-level method to query the API.
+
+        :param url: URL to query
+        :param method: HTTP method to use
+        :param data: query parameters to send as key/value pairs
+        :returns: JSON data
+        :raises HTTPException: if the query fails
+        """
+        logger.debug("Querying %s %s %s", method, url, data)
+
+        headers = {
+            "Accept": "application/json",
+            "Accept-Encoding": "gzip",
+        }
+
+        payload = (
+            *data,
+            ("mailto", self.email),
+        )
+
+        request = HTTPRequest(
+            url=f"{url}?{urllib.parse.urlencode(payload)}",
+            headers=headers,
+            method=method,
+        )
+
+        http_response = send_with_retry(request, self.retries)
+
+        if http_response.info().get("Content-Encoding") == "gzip":
+            return json.loads(gzip.decompress(http_response.read()))
+        else:
+            return json.loads(http_response.read())
+
+    @cached(cache=TTLCache(maxsize=1000, ttl=7 * 24 * 60 * 60))
+    def author(self, name: str) -> Any:
+        """
+        Search for an author by their display name.
+
+        See <https://docs.openalex.org/api-entities/authors> for details.
+
+        :param name: author display name
+        :returns: list of matching author ids
+        :raises HTTPException: if the query fails
+        """
+        authors_response = self._query(
+            url="https://api.openalex.org/authors",
+            method="GET",
+            data=(
+                ("search", name),
+                ("per-page", "50"),
+            ),
+        )
+
+        return tuple(
+            author["id"].removeprefix("https://openalex.org/")
+            for author in authors_response["results"]
+        )
+
+    @cached(cache=TTLCache(maxsize=1000, ttl=30 * 60))
+    def works(self, author_ids: Iterable[str]) -> Any:
+        """
+        Get list of most recent works by a set of authors.
+
+        See <https://docs.openalex.org/api-entities/works> for details.
+
+        :param author_ids: ids of the authors to include
+        :returns: list of latest works
+        :raises HTTPException: if the query fails
+        """
+        works_response = self._query(
+            url="https://api.openalex.org/works",
+            method="GET",
+            data=(
+                ("filter", f"author.id:{'|'.join(author_ids)}"),
+                ("sort", "publication_date:desc"),
+                ("per-page", "50"),
+            ),
+        )
+
+        results = []
+
+        for work in works_response["results"]:
+            abstract = "No abstract"
+            source = "Unknown"
+            authors = [
+                authorship["author"]["display_name"]
+                for authorship in work["authorships"]
+            ]
+
+            if work["abstract_inverted_index"] is not None:
+                abstract_reverse = work["abstract_inverted_index"]
+                abstract_forward = {}
+
+                for word, positions in abstract_reverse.items():
+                    for position in positions:
+                        abstract_forward[position] = word
+
+                abstract = " ".join(map(
+                    lambda item: item[1],
+                    sorted(abstract_forward.items()),
+                ))
+
+            if work["primary_location"]["source"] is not None:
+                source = work["primary_location"]["source"]["display_name"]
+
+            results.append({
+                "id": work["id"],
+                "title": work["title"],
+                "description": abstract,
+                "authors": authors,
+                "source": source,
+                "date": parse_iso_date(work["publication_date"]),
+                "url": work["doi"],
+            })
+
+        return results
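Two details in this client are easy to miss. OpenAlex returns abstracts as an inverted index mapping each word to its positions, which works() folds back into running text; a minimal standalone rendition of that loop:

inverted = {"hello": [0, 2], "world": [1]}

# Invert the index: position -> word, then read the words off in order.
forward = {}
for word, positions in inverted.items():
    for position in positions:
        forward[position] = word

text = " ".join(word for _, word in sorted(forward.items()))
print(text)  # hello world hello

Also, author() deliberately returns a tuple rather than a list: works() is wrapped in cachetools' @cached, whose default key function hashes the arguments, so the author ids must arrive as a hashable value.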