feedleware/feedleware/openalex/openalex.py

import gzip
import http.client
import json
import logging
import urllib.error
import urllib.parse
import urllib.request
from typing import Any, Iterable, Tuple

from cachetools import cached, TTLCache

from ..util import send_with_retry, parse_iso_date

HTTPError = urllib.error.HTTPError
HTTPRequest = urllib.request.Request
HTTPResponse = http.client.HTTPResponse
HTTPException = http.client.HTTPException

logger = logging.getLogger(__name__)


class APIClient:
    """Client for the OpenAlex API."""

    def __init__(self, email: str = "", retries: int = 3):
        """
        Create an OpenAlex API client.

        See <https://docs.openalex.org> for details.

        :param email: contact email to send along with each request
        :param retries: number of times to retry each request in case of failure
        """
        self.email: str = email
        self.retries: int = retries
    def _query(
        self,
        url: str,
        method: str = "GET",
        data: Iterable[Tuple[str, str]] = (),
    ) -> Any:
        """
        Low-level method to query the API.

        :param url: URL to query
        :param method: HTTP method to use
        :param data: query parameters to send, as key/value pairs
        :returns: decoded JSON data
        :raises HTTPException: if the query fails
        """
        logger.debug("Querying %s %s %s", method, url, data)
        headers = {
            "Accept": "application/json",
            "Accept-Encoding": "gzip",
        }
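        # Including a "mailto" parameter opts requests into OpenAlex's
        # "polite pool", which offers more generous rate limits
        # (see <https://docs.openalex.org>).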
        payload = (
            *data,
            ("mailto", self.email),
        )
        request = HTTPRequest(
            url=f"{url}?{urllib.parse.urlencode(payload)}",
            headers=headers,
            method=method,
        )
        http_response = send_with_retry(request, self.retries)
        # The server may honour Accept-Encoding and gzip the body.
        if http_response.info().get("Content-Encoding") == "gzip":
            return json.loads(gzip.decompress(http_response.read()))
        return json.loads(http_response.read())

    @cached(cache=TTLCache(maxsize=1000, ttl=7 * 24 * 60 * 60))
    def author(self, name: str) -> Any:
        """
        Search for an author by their display name.

        See <https://docs.openalex.org/api-entities/authors> for details.

        :param name: author display name
        :returns: tuple of matching author IDs
        :raises HTTPException: if the query fails
        """
        authors_response = self._query(
            url="https://api.openalex.org/authors",
            method="GET",
            data=(
                ("search", name),
                ("per-page", "50"),
            ),
        )
        # Strip the URL prefix so callers get bare OpenAlex IDs
        # rather than full URLs.
        return tuple(
            author["id"].removeprefix("https://openalex.org/")
            for author in authors_response["results"]
        )

    @cached(cache=TTLCache(maxsize=1000, ttl=30 * 60))
    def works(self, author_ids: Iterable[str]) -> Any:
        """
        Get the most recent works by a set of authors.

        See <https://docs.openalex.org/api-entities/works> for details.

        :param author_ids: IDs of the authors to look up (must be hashable,
            e.g. a tuple, because results are cached)
        :returns: list of latest works
        :raises HTTPException: if the query fails
        """
        works_response = self._query(
            url="https://api.openalex.org/works",
            method="GET",
            data=(
                ("filter", f"author.id:{'|'.join(author_ids)}"),
                ("sort", "publication_date:desc"),
                ("per-page", "50"),
            ),
        )
        results = []
        for work in works_response["results"]:
            abstract = "No abstract"
            source = "Unknown"
            authors = [
                authorship["author"]["display_name"]
                for authorship in work["authorships"]
                if authorship["author"]["display_name"] is not None
            ]
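            # OpenAlex ships abstracts as an inverted index mapping each
            # word to its positions, e.g. {"deep": [0], "learning": [1]};
            # inverting it and sorting by position recovers the plain text.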
            if work["abstract_inverted_index"] is not None:
                abstract_reverse = work["abstract_inverted_index"]
                abstract_forward = {}
                for word, positions in abstract_reverse.items():
                    for position in positions:
                        abstract_forward[position] = word
                abstract = " ".join(
                    word for _, word in sorted(abstract_forward.items())
                )
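            # "primary_location" itself can be null in OpenAlex responses,
            # so guard both levels before reading the source name.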
            location = work["primary_location"]
            if location is not None and location["source"] is not None:
                source = location["source"]["display_name"]
            results.append({
                "id": work["id"],
                "title": work["title"],
                "description": abstract,
                "authors": authors,
                "source": source,
                "date": parse_iso_date(work["publication_date"]),
                "url": work["doi"],
            })
        return results
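

if __name__ == "__main__":
    # Minimal usage sketch, not part of the library: look up an author by
    # name and print their latest works. Assumes network access; the name
    # and email below are placeholders. Run as a module (e.g.
    # `python -m feedleware.openalex.openalex`) so the relative import
    # above resolves.
    logging.basicConfig(level=logging.DEBUG)
    client = APIClient(email="you@example.com")
    author_ids = client.author("Ada Lovelace")
    if author_ids:
        # Query works for the first matching author only.
        for work in client.works(author_ids[:1]):
            print(work["date"], work["title"], f"({work['source']})")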