From f27f38d1a9a616d66e19b55c97521a996ce31e53 Mon Sep 17 00:00:00 2001 From: Alexandru Fikl Date: Sat, 16 Mar 2024 10:16:09 +0200 Subject: [PATCH 01/11] makefile: simplify targets --- Makefile | 484 ++++--------------------------------------------------- 1 file changed, 27 insertions(+), 457 deletions(-) diff --git a/Makefile b/Makefile index c300533..7e98982 100644 --- a/Makefile +++ b/Makefile @@ -1,460 +1,30 @@ +PYTHON?=python -X dev -# File: common-makefile/src/version.m4 -MAKEFILE_VERSION = v0.0.1-21-g392d792 -MAKEFILE_DATE = 31-07-2017 15:47 -MAKEFILE_AUTHOR = Alejandro Gallo -MAKEFILE_URL = https://github.com/alejandrogallo/python-makefile -MAKEFILE_LICENSE = GPLv3 +all: help - - - -## < /dev/null) -# If messages should have color -WITH_COLOR ?= 1 - -ifneq ($(strip $(QUIET)),0) -FD_OUTPUT = 2>&1 > /dev/null -else -FD_OUTPUT = -endif - -ifdef DEBUG -DBG_FLAG = -DBG_FILE ?= .makefile-dbg -$(shell date | $(SED) "p; s/./=/g" > $(DBG_FILE)) -else -DBG_FLAG = @ -DBG_FILE = -endif - -define log-debug ->> $(or $(DBG_FILE),/dev/null) echo -endef - -# Print commands like [CMD] -define print-cmd-name -"[$(COLOR_LB) \ -$(shell \ - if test "$(1)" = g++; then \ - echo -n GXX; \ - elif test "$(1)" = gcc; then \ - echo -n GCC; \ - elif test "$(1)" = icc; then \ - echo -n ICC; \ - elif test "$(1)" = cc; then \ - echo -n CC; \ - elif test "$(1)" = povray; then \ - echo -n POV; \ - elif test "$(1)" = perl; then \ - echo -n PL; \ - elif test "$(1)" = perl5; then \ - echo -n PL5; \ - elif test "$(1)" = ruby; then \ - echo -n RB; \ - elif test "$(1)" = ruby2; then \ - echo -n RB2; \ - elif test "$(1)" = python; then \ - echo -n PY; \ - elif test "$(1)" = python2; then \ - echo -n PY2; \ - elif test "$(1)" = python3; then \ - echo -n PY3; \ - elif test "$(1)" = pdflatex; then \ - echo -n pdfTeX; \ - elif test "$(1)" = bash; then \ - echo -n BASH; \ - elif test "$(1)" = gnuplot; then \ - echo -n GPT; \ - elif test "$(1)" = mupdf; then \ - echo -n muPDF; \ - else \ - echo -n "$(1)" 
| tr a-z A-Z ; \ - fi -)\ -$(COLOR_E)]" -endef - -ifndef QQUIET - -ifeq ($(strip $(WITH_COLOR)),1) -# Red -COLOR_R ?= $(if $(TPUT),$(shell $(TPUT) setaf 1),"\033[0;31m") -# Green -COLOR_G ?= $(if $(TPUT),$(shell $(TPUT) setaf 2),"\033[0;32m") -# Yellow -COLOR_Y ?= $(if $(TPUT),$(shell $(TPUT) setaf 3),"\033[0;33m") -# Dark blue -COLOR_DB ?= $(if $(TPUT),$(shell $(TPUT) setaf 4),"\033[0;34m") -# Lila -COLOR_L ?= $(if $(TPUT),$(shell $(TPUT) setaf 5),"\033[0;35m") -# Light blue -COLOR_LB ?= $(if $(TPUT),$(shell $(TPUT) setaf 6),"\033[0;36m") -# Empty color -COLOR_E ?= $(if $(TPUT),$(shell $(TPUT) sgr0),"\033[0m") -ARROW ?= @echo "$(COLOR_L)===>$(COLOR_E)" -else -ARROW ?= @echo "===>" -endif #WITH_COLOR - -ECHO ?= @echo - -else -ARROW := @ > /dev/null echo -ECHO := @ > /dev/null echo -endif #QQUIET - - - - - - -# File: ctags.m4 - - -# ==================================== -# Ctags generation for latex documents -# ==================================== -# -# Generate a tags file so that you can navigate through the tags using -# compatible editors such as emacs or (n)vi(m). 
-# -tags: ## Create python exhuberant ctags - $(CTAGS) --language-force=python -R * - - - -# File: install.m4 - - -# Old-style requirements file -REQUIREMENTS ?= requirements.txt -# Command to be run when make `install` is run -INSTALL_COMMAND ?= $(PYTHON) setup.py install -# Command to be run when make `install-local` is run -INSTALL_LOCAL_COMMAND ?= $(PYTHON) setup.py install --user -# Command to be run when make `install-dev` is run -INSTALL_DEV_COMMAND ?= $(PYTHON) setup.py develop -# Command to be run when make `install-dev-local` is run -INSTALL_DEV_LOCAL_COMMAND ?= $(PYTHON) setup.py develop --user -# Command to be run when make `uninstall` is run -UNINSTALL_COMMAND ?= $(PIP) uninstall $(shell $(PYTHON) setup.py --name) -# Command to be run when make `install-deps` is run -INSTALL_DEPS_COMMAND ?= $(PIP) install -r requirements.txt -# Command to be run when make `install-deps-local` is run -INSTALL_DEPS_LOCAL_COMMAND ?= $(PIP) install --user -r requirements.txt -install-dev-local: ## Install developement version locally - $(ARROW) Installing development version locally - $(DBG_FLAG)$(INSTALL_DEV_LOCAL_COMMAND) - -install-dev: ## Install developement version - $(ARROW) Installing development version - $(DBG_FLAG)$(INSTALL_DEV_COMMAND) - -install-local: ## Install the package locally - $(ARROW) Installing locally - $(DBG_FLAG)$(INSTALL_LOCAL_COMMAND) - -install: ## Install the package - $(ARROW) Installing... - $(DBG_FLAG)$(INSTALL_COMMAND) - -uninstall: ## Uninstall the package - $(ARROW) Uninstalling... - $(DBG_FLAG)$(UNINSTALL_COMMAND) - -install-deps-local: ## Install python requirements locally - $(ARROW) Installing dependencies... - $(DBG_FLAG)$(INSTALL_DEPS_LOCAL_COMMAND) - -install-deps: ## Install python requirements - $(ARROW) Installing dependencies... 
- $(DBG_FLAG)$(INSTALL_DEPS_COMMAND) - - - -# File: lint.m4 - - -# Linter program -PY_LINTER ?= flake8 -# ============ -# Check syntax -# ============ -# -# It checks the syntax (lints) of all the tex sources using the program in the -# TEX_LINTER variable. -# -lint: ## Check syntax of sources - $(PY_LINTER) - - - -# File: doc.m4 - - -doc: ## Create documentation - make -C doc/ html - -doc-%: - make -C doc/ $* - -update-gh-pages: ## Update github pages - @echo "Warning: Black magic in action" - git push origin $$(git subtree split --prefix doc/build/html/ master):gh-pages --force - - - - -# File: test.m4 - - -# Command to run for `make test` -TEST_COMMAND ?= $(PYTHON) setup.py test -test: ## Run the tests - $(DBG_FLAG)$(TEST_COMMAND) - - - -# File: virtualenv.m4 - - -ENV ?= -ENV_FOLDER ?= env -ENV_PIP ?= $(ENV_FOLDER)/bin/pip -ENV_PYTHON ?= $(ENV_FOLDER)/bin/python -VIRTUALENV ?= virtualenv - -ifdef ENV -PYTHON = $(ENV_PYTHON) -PIP = $(ENV_PIP) -DEPENDENCIES += virtualenv -DIST_DEPENDENCIES += virtualenv -endif - -virtualenv: $(ENV_FOLDER) ## Create the python virtual environment -$(ENV_FOLDER): - $(ARROW) "Creating virtual environment in '$(ENV_FOLDER)' \ - with python executable '$(PYTHON)'" - $(DBG_FLAG)$(VIRTUALENV) -p $(PYTHON) $(ENV_FOLDER) - - - - -# File: common-makefile/src/update.m4 - - -MAKEFILE_UPDATE_URL ?= https://raw.githubusercontent.com/alejandrogallo/python-makefile/master/dist/Makefile - - -# =============================== -# Update the makefile from source -# =============================== -# -# You can always get the latest `Makefile` version using this target. 
You may -# override the `MAKEFILE_UPDATE_URL` to any path where you save your own -# personal makefile -# -update: ## Update the makefile from the repository - $(ARROW) "Getting makefile from $(MAKEFILE_UPDATE_URL)" - $(DBG_FLAG)wget $(MAKEFILE_UPDATE_URL) -O Makefile - - - - -# File: common-makefile/src/clean.m4 - - -# Remove command flags -RM_FLAGS ?= -rf - -# Default clean file to be cleaned -DEFAULT_CLEAN_FILES ?= - -# Files to be cleaned -CLEAN_FILES ?= $(DEFAULT_CLEAN_FILES) - -# ============= -# Main cleaning -# ============= -# -# This does a main cleaning of the produced auxiliary files. Before using it -# check which files are going to be cleaned up. -# -clean: ## Remove build and temporary files - $(ARROW) Cleaning up... - $(DBG_FLAG) {\ - for file in $(CLEAN_FILES); do \ - test -e $$file && { \ - $(RM) $(RM_FLAGS) $$file && \ - echo $(call print-cmd-name,RM) "$$file";\ - } || : ; \ - done \ - } - - - - -# File: common-makefile/src/print-variable.m4 - - -# This is used for printing defined variables from Some other scripts. For -# instance if you want to know the value of the `PDF_VIEWER` defined in the -# Makefile, then you would do -# ``` -# make print-PDF_VIEWER -# ``` -# and this would output `PDF_VIEWER=mupdf` for instance. -FORCE: -print-%: - $(DBG_FLAG)echo '$*=$($*)' - -# ===================================== -# Print a variable used by the Makefile -# ===================================== -# -# For debugging purposes it is useful to print out some variables that the -# makefile is using, for that just type `make print` and you will be prompted -# to insert the name of the variable that you want to know. 
-# -FORCE: -print: ## Print a variable - $(DBG_FLAG)read -p "Variable to print: " variable && \ - $(MAKE) --no-print-directory print-$$variable - - - - -# File: common-makefile/src/help.m4 - - - -# ================ -# Print quick help -# ================ -# -# It prints a quick help in the terminal -help: ## Prints help for targets with comments - $(DBG_FLAG)$(or $(AWK),awk) ' \ - BEGIN {FS = ":.*?## "}; \ - /^## *< Date: Sat, 16 Mar 2024 10:28:40 +0200 Subject: [PATCH 02/11] setup: switch to pyproject and hatchling --- MANIFEST.in | 11 ------- pyproject.toml | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++ setup.cfg | 26 ---------------- setup.py | 60 ------------------------------------ 4 files changed, 82 insertions(+), 97 deletions(-) delete mode 100644 MANIFEST.in create mode 100644 pyproject.toml delete mode 100644 setup.cfg delete mode 100644 setup.py diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 2d4499e..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1,11 +0,0 @@ -graft src -include AUTHORS.rst -include CONTRIBUTING.rst -include LICENSE -include README.rst - -recursive-include tests * -recursive-exclude * __pycache__ -recursive-exclude * *.py[co] - -recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..1e3bc83 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,82 @@ +[build-system] +build-backend = "hatchling.build" +requires = [ + "hatchling>=1.10", +] + +[project] +name = "python-doi" +version = "0.2.0" +description = "Python package to work with Document Object Identifiers (DOIs)" +readme = "README.rst" +keywords = [ + "doi", +] +license = { text = "GPL-3.0-or-later" } +maintainers = [{ name = "Alejandro Gallo", email = "aamsgallo@gmail.com" }] +authors = [{ name = "Alejandro Gallo", email = "aamsgallo@gmail.com" }] +requires-python = ">=3.8" +classifiers = [ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + 
"License :: OSI Approved :: GNU General Public License v3 (GPLv3)", + "Natural Language :: English", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Topic :: Utilities", +] + +[project.optional-dependencies] +develop = [ + "flake8", + "flake8-bugbear", + "Flake8-pyproject", + "flake8-quotes", + "mypy>=0.7", + "pep8-naming", + "pytest", + "pytest-cov", + "python-coveralls", +] +docs = [ + "sphinx>=4", + "sphinx_rtd_theme>=1", +] + +[project.urls] +Repository = "https://github.com/papis/python-doi" + +[tool.hatch.build.targets.sdist] +exclude = [".github", "docs/build"] + +[tool.hatch.build.targets.wheel] +packages = ["src/doi"] + +[tool.flake8] +select = ["B", "D", "E", "F", "N", "Q", "W"] +extend-ignore = ["B019", "E123", "N818", "W503"] +max-line-length = 88 +inline-quotes = "double" +multiline-quotes = "double" + +[tool.pytest.ini_options] +addopts = [ + "--doctest-modules", + "--cov=src/doi", +] +markers = [ + "net: marks tests that call use the net" +] + +[tool.mypy] +strict = true +show_column_numbers = true +hide_error_codes = false +pretty = true +warn_unused_ignores = false diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index f587ed0..0000000 --- a/setup.cfg +++ /dev/null @@ -1,26 +0,0 @@ -[bdist_wheel] -universal = 1 - -[flake8] -exclude = docs - -[tool:pytest] -markers = - net: marks tests that call use the net (using the URL endpoint, deselect with '-k "not net"') - - -[mypy] -disallow_redefinition = True -warn_unused_configs = True -disallow_any_generics = True -disallow_subclassing_any = True -disallow_untyped_calls = True -disallow_untyped_defs = True -disallow_incomplete_defs = True -check_untyped_defs = True -disallow_untyped_decorators 
= True -no_implicit_optional = True -warn_redundant_casts = True -warn_unused_ignores = True -warn_return_any = True -no_implicit_reexport = True diff --git a/setup.py b/setup.py deleted file mode 100644 index 25324a7..0000000 --- a/setup.py +++ /dev/null @@ -1,60 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -"""The setup script.""" - -from setuptools import setup, find_packages - - -def get_version(filename): - """Extract the package version""" - with open(filename) as in_fh: - for line in in_fh: - if line.startswith('__version__'): - return line.split('=')[1].strip()[1:-1] - raise ValueError("Cannot extract version from %s" % filename) - - -with open('README.rst') as readme_file: - readme = readme_file.read() - -requirements = [] - -dev_requirements = [ - 'coverage', 'pytest', 'pytest-cov==2.5.0', 'twine', 'pep8', - 'flake8', 'wheel', 'mypy', - 'sphinx', 'sphinx-autobuild', 'sphinx-autodoc-typehints', - 'sphinx_rtd_theme'] - -version = get_version('./src/doi/__init__.py') - -setup( - author="Alejandro Gallo", - author_email='aamsgallo@gmail.com', - classifiers=[ - 'Development Status :: 2 - Pre-Alpha', - 'Intended Audience :: Developers', - 'License :: OSI Approved :: GNU General Public License v3 (GPLv3)', - 'Natural Language :: English', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', - ], - description="Python package to work with Document Object Identifier (doi)", - install_requires=requirements, - extras_require={ - 'dev': dev_requirements, - }, - license="GNU General Public License v3", - long_description=readme, - include_package_data=True, - keywords='doi', - name='python-doi', - package_data={"doi": ["py.typed"]}, - packages=find_packages(where="src"), - package_dir={"": "src"}, - url='https://github.com/papis/python-doi', - version=version, - zip_safe=False, -) From 758318b79cd58dd7c00f0fe1f46672ab8113ab06 Mon Sep 17 
00:00:00 2001 From: Alexandru Fikl Date: Sat, 16 Mar 2024 10:28:59 +0200 Subject: [PATCH 03/11] ci: remove travis.yaml --- .travis.yml | 19 ------------------- 1 file changed, 19 deletions(-) delete mode 100644 .travis.yml diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 70589f5..0000000 --- a/.travis.yml +++ /dev/null @@ -1,19 +0,0 @@ -# Config file for automatic testing at travis-ci.org - -language: python -python: - - 3.8 - - 3.7 - - 3.6 - - 3.5 - -install: - - pip install -e .[dev] - - pip install coveralls - -script: - - py.test --doctest-modules --cov=doi src tests - - mypy src/ tests - - flake8 src/ tests -after_success: - - coveralls From 507acb0677467c3de5c705a821e32b09e8489d8d Mon Sep 17 00:00:00 2001 From: Alexandru Fikl Date: Sat, 16 Mar 2024 10:33:17 +0200 Subject: [PATCH 04/11] ci: add tests to ci --- .github/workflows/codeql.yml | 69 ++++++++++-------------------------- .github/workflows/main.yml | 43 ++++++++++++++++++++++ .github/workflows/pyre.yml | 46 ------------------------ 3 files changed, 62 insertions(+), 96 deletions(-) create mode 100644 .github/workflows/main.yml delete mode 100644 .github/workflows/pyre.yml diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index fa89976..52e191d 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -1,23 +1,13 @@ -# For most projects, this workflow file will not need changing; you simply need -# to commit it to your repository. -# -# You may wish to alter this file to override the set of languages analyzed, -# or to provide custom queries or build logic. -# -# ******** NOTE ******** -# We have attempted to detect the languages in your repository. Please check -# the `language` matrix defined below to confirm you have the correct set of -# supported CodeQL languages. 
-# name: "CodeQL" on: push: branches: [ "master" ] pull_request: - # The branches below must be a subset of the branches above branches: [ "master" ] - + schedule: + # 17:00 on Friday (UTC) + - cron: "00 17 * * 5" jobs: analyze: @@ -32,42 +22,21 @@ jobs: fail-fast: false matrix: language: [ 'python' ] - # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] - # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support steps: - - name: Checkout repository - uses: actions/checkout@v3 - - # Initializes the CodeQL tools for scanning. - - name: Initialize CodeQL - uses: github/codeql-action/init@v2 - with: - languages: ${{ matrix.language }} - # If you wish to specify custom queries, you can do so here or in a config file. - # By default, queries listed here will override any specified in a config file. - # Prefix the list here with "+" to use these queries and those in the config file. - - # Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs - # queries: security-extended,security-and-quality - - - # Autobuild attempts to build any compiled languages (C/C++, C#, Go, or Java). - # If this step fails, then you should remove it and run the build manually (see below) - - name: Autobuild - uses: github/codeql-action/autobuild@v2 - - # ℹ️ Command-line programs to run using the OS shell. - # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun - - # If the Autobuild fails above, remove it and uncomment the following three lines. - # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance.
- - # - run: | - # echo "Run, Build Application using script" - # ./location_of_script_within_repo/buildscript.sh - - - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v2 - with: - category: "/language:${{matrix.language}}" + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Initialize CodeQL + uses: github/codeql-action/init@v3 + with: + languages: ${{ matrix.language }} + queries: +security-and-quality + + - name: Autobuild + uses: github/codeql-action/autobuild@v3 + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v3 + with: + category: "/language:${{ matrix.language }}" diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 0000000..b049b7c --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,43 @@ +name: CI + +on: + push: + branches: [ "master", "ci-*" ] + pull_request: + branches: [ "master" ] + schedule: + # 17:00 on Friday (UTC) + - cron: "00 17 * * 5" + +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-latest, macos-latest, windows-latest] + python-version: ['3.8', '3.9', '3.10', '3.11', '3.12', '3.13'] + fail-fast: false + + steps: + - uses: actions/checkout@v4 + - name: Set up python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + python -m pip install --editable '.[develop,docs]' + shell: bash + + - name: Check linting and type annotations + run: | + python -m flake8 src tests + python -m mypy src tests + shell: bash + + - name: Run tests + if: success() || failure() + run: | + python -m pytest -v -s src tests + shell: bash diff --git a/.github/workflows/pyre.yml b/.github/workflows/pyre.yml deleted file mode 100644 index 21fd46f..0000000 --- a/.github/workflows/pyre.yml +++ /dev/null @@ -1,46 +0,0 @@ -# This workflow uses actions that are not certified by GitHub. 
-# They are provided by a third-party and are governed by -# separate terms of service, privacy policy, and support -# documentation. - -# This workflow integrates Pyre with GitHub's -# Code Scanning feature. -# -# Pyre is a performant type checker for Python compliant with -# PEP 484. Pyre can analyze codebases with millions of lines -# of code incrementally – providing instantaneous feedback -# to developers as they write code. -# -# See https://pyre-check.org - -name: Pyre - -on: - workflow_dispatch: - push: - branches: [ "master" ] - pull_request: - branches: [ "master" ] - -permissions: - contents: read - -jobs: - pyre: - permissions: - actions: read - contents: read - security-events: write - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - - name: Run Pyre - uses: facebook/pyre-action@60697a7858f7cc8470d8cc494a3cf2ad6b06560d - with: - # To customize these inputs: - # See https://github.com/facebook/pyre-action#inputs - repo-directory: './src/' - From a64941e750783d7ced841b432daf9775e3bd33c9 Mon Sep 17 00:00:00 2001 From: Alexandru Fikl Date: Sat, 16 Mar 2024 10:39:36 +0200 Subject: [PATCH 05/11] style: fix flake8 issues --- src/doi/__init__.py | 40 +++++++++---------- tests/test_doi.py | 96 +++++++++++++++++++++------------------------ 2 files changed, 64 insertions(+), 72 deletions(-) diff --git a/src/doi/__init__.py b/src/doi/__init__.py index e19359f..cd19507 100644 --- a/src/doi/__init__.py +++ b/src/doi/__init__.py @@ -5,7 +5,7 @@ from typing import Optional -__version__ = '0.2.0' +__version__ = "0.2.0" logger = logging.getLogger("doi") # type: logging.Logger @@ -24,9 +24,9 @@ def pdf_to_doi(filepath: str, maxlines: Optional[int] = None) -> Optional[str]: if maxlines is None: maxlines = sys.maxsize - with open(filepath, 'rb') as fd: + with open(filepath, "rb") as fd: for j, line in enumerate(fd): - doi = find_doi_in_text(line.decode('ascii', errors='ignore')) + doi = find_doi_in_text(line.decode("ascii", 
errors="ignore")) if doi: return doi if j > maxlines: @@ -48,16 +48,16 @@ def validate_doi(doi: str) -> Optional[str]: import urllib.parse import json url = "https://doi.org/api/handles/{doi}".format(doi=doi) - logger.debug('handle url %s', url) + logger.debug("handle url %s", url) request = urllib.request.Request(url) try: result = json.loads(urllib.request.urlopen(request).read().decode()) except HTTPError: - raise ValueError('HTTP 404: DOI not found') + raise ValueError("HTTP 404: DOI not found") else: - urls = [v['data']['value'] - for v in result['values'] if v.get('type') == 'URL'] + urls = [v["data"]["value"] + for v in result["values"] if v.get("type") == "URL"] return urls[0] if urls else None @@ -68,12 +68,12 @@ def get_clean_doi(doi: str) -> str: :param doi: String containing a DOI. :returns: The extracted DOI. """ - doi = re.sub(r'%2F', '/', doi) + doi = re.sub(r"%2F", "/", doi) # For pdfs - doi = re.sub(r'\)>', ' ', doi) - doi = re.sub(r'\)/S/URI', ' ', doi) - doi = re.sub(r'(/abstract)', '', doi) - doi = re.sub(r'\)$', '', doi) + doi = re.sub(r"\)>", " ", doi) + doi = re.sub(r"\)/S/URI", " ", doi) + doi = re.sub(r"(/abstract)", "", doi) + doi = re.sub(r"\)$", "", doi) return doi @@ -87,11 +87,11 @@ def find_doi_in_text(text: str) -> Optional[str]: forbidden_doi_characters = r'"\s%$^\'<>@,;:#?&' # Sometimes it is in the javascript defined var_doi = re.compile( - r'doi(.org)?' - r'\s*(=|:|/|\()\s*' - r'("|\')?' - r'(?P[^{fc}]+)' - r'("|\'|\))?' + r"doi(.org)?" + r"\s*(=|:|/|\()\s*" + r"(\"|')?" + r"(?P[^{fc}]+)" + r"(\"|'|\))?" 
.format( fc=forbidden_doi_characters ), re.I @@ -102,7 +102,7 @@ def find_doi_in_text(text: str) -> Optional[str]: try: m = next(miter) if m: - doi = m.group('doi') + doi = m.group("doi") return get_clean_doi(doi) except StopIteration: pass @@ -119,8 +119,8 @@ def get_real_url_from_doi(doi: str) -> Optional[str]: if url is None: return url - m = re.match(r'.*linkinghub\.elsevier.*/pii/([A-Z0-9]+).*', url, re.I) + m = re.match(r".*linkinghub\.elsevier.*/pii/([A-Z0-9]+).*", url, re.I) if m: - return ('https://www.sciencedirect.com/science/article/abs/pii/{pii}' + return ("https://www.sciencedirect.com/science/article/abs/pii/{pii}" .format(pii=m.group(1))) return url diff --git a/tests/test_doi.py b/tests/test_doi.py index 4d3d478..fa51492 100644 --- a/tests/test_doi.py +++ b/tests/test_doi.py @@ -1,52 +1,44 @@ -"""Tests for `doi` package.""" - import os -from pkg_resources import parse_version import pytest from doi import ( - validate_doi, find_doi_in_text, __version__, pdf_to_doi, + validate_doi, find_doi_in_text, pdf_to_doi, get_real_url_from_doi ) -def test_valid_version() -> None: - """Check that the package defines a valid __version__""" - assert parse_version(__version__) >= parse_version("0.1.0") - - @pytest.mark.net def test_validate_doi() -> None: data = [ - ('10.1063/1.5081715', - 'http://aip.scitation.org/doi/10.1063/1.5081715'), - ('10.1007%2FBF01451751', - 'http://link.springer.com/10.1007/BF01451751'), - ('10.1103/PhysRevLett.49.57', - 'https://link.aps.org/doi/10.1103/PhysRevLett.49.57'), - ('10.1080/14786442408634457', - 'https://www.tandfonline.com/doi/full/10.1080/14786442408634457'), - ('10.1021/jp003647e', 'https://pubs.acs.org/doi/10.1021/jp003647e'), - ('10.1016/S0009-2614(97)04014-1', - 'https://linkinghub.elsevier.com/retrieve/pii/S0009261497040141'), + ("10.1063/1.5081715", + "http://aip.scitation.org/doi/10.1063/1.5081715"), + ("10.1007%2FBF01451751", + "http://link.springer.com/10.1007/BF01451751"), + ("10.1103/PhysRevLett.49.57", + 
"https://link.aps.org/doi/10.1103/PhysRevLett.49.57"), + ("10.1080/14786442408634457", + "https://www.tandfonline.com/doi/full/10.1080/14786442408634457"), + ("10.1021/jp003647e", "https://pubs.acs.org/doi/10.1021/jp003647e"), + ("10.1016/S0009-2614(97)04014-1", + "https://linkinghub.elsevier.com/retrieve/pii/S0009261497040141"), ] for doi, url in data: assert url == validate_doi(doi) - for doi in ['', 'asdf']: + for doi in ["", "asdf"]: try: validate_doi(doi) except ValueError as e: - assert str(e) == 'HTTP 404: DOI not found' + assert str(e) == "HTTP 404: DOI not found" @pytest.mark.net def test_get_real_url_from_doi() -> None: data = [ - ('10.1016/S0009-2614(97)04014-1', - 'https://www.sciencedirect.com/science/' - 'article/abs/pii/S0009261497040141'), + ("10.1016/S0009-2614(97)04014-1", + "https://www.sciencedirect.com/science/" + "article/abs/pii/S0009261497040141"), ] for doi, url in data: assert url == get_real_url_from_doi(doi) @@ -54,40 +46,40 @@ def test_get_real_url_from_doi() -> None: def test_find_doi_in_line() -> None: test_data = [ - ('http://dx.doi.org/10.1063/1.881498', '10.1063/1.881498'), - ('http://dx.doi.org/10.1063%2F1.881498', '10.1063/1.881498'), - (2*'qer '+'var doi = "12345/12345.3"', '12345/12345.3'), - (2*'qer '+"var doi = '12345/12345.3';fas", '12345/12345.3'), - (2*'qer '+"var DoI = 12345%2F12345.3", '12345/12345.3'), - (2*'qer '+"var DoI : 12345%2F12345.3", '12345/12345.3'), - ('http://scitation.org/doi/10.1063/1.881498', '10.1063/1.881498'), - ('org/doi(10.1063/1.881498)', '10.1063/1.881498'), - ('/scitation.org/doi/10.1063/1.881498?234saf=34', '10.1063/1.881498'), - ('/scitation.org/doi/10.1063/1.88149 8?234saf=34', '10.1063/1.88149'), - ('/scitation.org/doi/10.1063/1.uniau12?as=234', - '10.1063/1.uniau12'), - ('https://doi.org/10.1093/analys/anw053', '10.1093/analys/anw053'), - ('http://.scitation.org/doi/10.1063/1.mart(88)1498?asdfwer', - '10.1063/1.mart(88)1498'), - ('@ibook{doi:10.1002/9780470125915.ch2,', 
'10.1002/9780470125915.ch2'), + ("http://dx.doi.org/10.1063/1.881498", "10.1063/1.881498"), + ("http://dx.doi.org/10.1063%2F1.881498", "10.1063/1.881498"), + (2 * "qer " + "var doi = '12345/12345.3'", "12345/12345.3"), + (2 * "qer " + "var doi = '12345/12345.3';fas", "12345/12345.3"), + (2 * "qer " + "var DoI = 12345%2F12345.3", "12345/12345.3"), + (2 * "qer " + "var DoI : 12345%2F12345.3", "12345/12345.3"), + ("http://scitation.org/doi/10.1063/1.881498", "10.1063/1.881498"), + ("org/doi(10.1063/1.881498)", "10.1063/1.881498"), + ("/scitation.org/doi/10.1063/1.881498?234saf=34", "10.1063/1.881498"), + ("/scitation.org/doi/10.1063/1.88149 8?234saf=34", "10.1063/1.88149"), + ("/scitation.org/doi/10.1063/1.uniau12?as=234", + "10.1063/1.uniau12"), + ("https://doi.org/10.1093/analys/anw053", "10.1093/analys/anw053"), + ("http://.scitation.org/doi/10.1063/1.mart(88)1498?asdfwer", + "10.1063/1.mart(88)1498"), + ("@ibook{doi:10.1002/9780470125915.ch2,", "10.1002/9780470125915.ch2"), ('application/pdf' - 'doi:10.1063/1.5079474', - '10.1063/1.5079474'), - ('<(DOI:10.1002/9780470915.CH2)/S/URI,', '10.1002/9780470915.CH2'), - ('URL<(DOI:10.1002/9780470125915.CH2,', '10.1002/9780470125915.CH2'), - (r'A<>/' - r'Border[0 0 0]/M(D:20181022082356+0530)/Rect[147.40158 594.36926' - r'347.24957 605.36926]/Subtype/Link/Type/A', - '10.1016/j.comptc.2018.10.004'), - ('doi(10.1038/s41535-018-0103-6;)', '10.1038/s41535-018-0103-6'), + "doi:10.1063/1.5079474", + "10.1063/1.5079474"), + ("<(DOI:10.1002/9780470915.CH2)/S/URI,", "10.1002/9780470915.CH2"), + ("URL<(DOI:10.1002/9780470125915.CH2,", "10.1002/9780470125915.CH2"), + (r"A<>/" + r"Border[0 0 0]/M(D:20181022082356+0530)/Rect[147.40158 594.36926" + r"347.24957 605.36926]/Subtype/Link/Type/A", + "10.1016/j.comptc.2018.10.004"), + ("doi(10.1038/s41535-018-0103-6;)", "10.1038/s41535-018-0103-6"), ] for url, doi in test_data: assert find_doi_in_text(url) == doi def test_doi_from_pdf() -> None: - f = os.path.join(os.path.dirname(__file__), 
'resources', 'doc.pdf') + f = os.path.join(os.path.dirname(__file__), "resources", "doc.pdf") assert os.path.exists(f) - assert pdf_to_doi(f) == '10.1103/PhysRevLett.50.1998' + assert pdf_to_doi(f) == "10.1103/PhysRevLett.50.1998" From 844fdfd3ea28d973379513b853f960a5fa85dde2 Mon Sep 17 00:00:00 2001 From: Alexandru Fikl Date: Sat, 16 Mar 2024 15:32:15 +0200 Subject: [PATCH 06/11] tests: fix tests --- tests/test_doi.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/tests/test_doi.py b/tests/test_doi.py index fa51492..bb3a763 100644 --- a/tests/test_doi.py +++ b/tests/test_doi.py @@ -12,16 +12,17 @@ def test_validate_doi() -> None: data = [ ("10.1063/1.5081715", - "http://aip.scitation.org/doi/10.1063/1.5081715"), + "https://pubs.aip.org/jcp/article/150/7/074102/197572/Exact-two-component-equation-of-motion-coupled"), # noqa: E501 ("10.1007%2FBF01451751", - "http://link.springer.com/10.1007/BF01451751"), + "http://link.springer.com/10.1007/BF01451751"), ("10.1103/PhysRevLett.49.57", - "https://link.aps.org/doi/10.1103/PhysRevLett.49.57"), + "https://link.aps.org/doi/10.1103/PhysRevLett.49.57"), ("10.1080/14786442408634457", - "https://www.tandfonline.com/doi/full/10.1080/14786442408634457"), - ("10.1021/jp003647e", "https://pubs.acs.org/doi/10.1021/jp003647e"), + "https://www.tandfonline.com/doi/full/10.1080/14786442408634457"), + ("10.1021/jp003647e", + "https://pubs.acs.org/doi/10.1021/jp003647e"), ("10.1016/S0009-2614(97)04014-1", - "https://linkinghub.elsevier.com/retrieve/pii/S0009261497040141"), + "https://linkinghub.elsevier.com/retrieve/pii/S0009261497040141"), ] for doi, url in data: assert url == validate_doi(doi) @@ -57,21 +58,21 @@ def test_find_doi_in_line() -> None: ("/scitation.org/doi/10.1063/1.881498?234saf=34", "10.1063/1.881498"), ("/scitation.org/doi/10.1063/1.88149 8?234saf=34", "10.1063/1.88149"), ("/scitation.org/doi/10.1063/1.uniau12?as=234", - "10.1063/1.uniau12"), + "10.1063/1.uniau12"), 
("https://doi.org/10.1093/analys/anw053", "10.1093/analys/anw053"), ("http://.scitation.org/doi/10.1063/1.mart(88)1498?asdfwer", - "10.1063/1.mart(88)1498"), + "10.1063/1.mart(88)1498"), ("@ibook{doi:10.1002/9780470125915.ch2,", "10.1002/9780470125915.ch2"), ('application/pdf' "doi:10.1063/1.5079474", - "10.1063/1.5079474"), + "10.1063/1.5079474"), ("<(DOI:10.1002/9780470915.CH2)/S/URI,", "10.1002/9780470915.CH2"), ("URL<(DOI:10.1002/9780470125915.CH2,", "10.1002/9780470125915.CH2"), (r"A<>/" r"Border[0 0 0]/M(D:20181022082356+0530)/Rect[147.40158 594.36926" r"347.24957 605.36926]/Subtype/Link/Type/A", - "10.1016/j.comptc.2018.10.004"), + "10.1016/j.comptc.2018.10.004"), ("doi(10.1038/s41535-018-0103-6;)", "10.1038/s41535-018-0103-6"), ] for url, doi in test_data: From 34ea67ce89b16ceac5bb47df5ab7a098d6c06bc6 Mon Sep 17 00:00:00 2001 From: gesh Date: Mon, 23 Jun 2025 21:59:08 +0300 Subject: [PATCH 07/11] tests: Configure pytest to ignore docs --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 1e3bc83..aa6195c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -69,6 +69,7 @@ multiline-quotes = "double" addopts = [ "--doctest-modules", "--cov=src/doi", + "--ignore=docs", ] markers = [ "net: marks tests that call use the net" From 2e4b622e00d72097e369b7be5ad61e107ebf3f8d Mon Sep 17 00:00:00 2001 From: gesh Date: Tue, 15 Apr 2025 21:07:43 +0300 Subject: [PATCH 08/11] Resolve redirects when testing URLs for equality The URL DOIs resolve to can move around, with redirects pointing to the new location. To make the tests more robust, only fail if the URLs differ after redirections. 
See also https://www.crossref.org/blog/urls-and-dois-a-complicated-relationship/ --- tests/test_doi.py | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/tests/test_doi.py b/tests/test_doi.py index bb3a763..d062379 100644 --- a/tests/test_doi.py +++ b/tests/test_doi.py @@ -1,5 +1,9 @@ import os +from urllib.request import Request, urlopen +from urllib.parse import urlparse, urlunparse +from warnings import warn + import pytest from doi import ( @@ -8,6 +12,30 @@ ) +def simplify_url(u): + return urlparse(u)._replace(query='', fragment='') + + +def resolve_redirects(u): + # Unconditionally upgrade to https, since some resolvers seem to require it + # If removed, it'd make sense to canonicalize in simplify_url instead to + # prevent spurious test failures + u = urlunparse(urlparse(u)._replace(scheme='https')) + req = Request(u, headers={'User-Agent': 'Mozilla/5.0'}) + with urlopen(req) as r: + return simplify_url(r.url) + + +def normalize_eq(u, v): + if u == v: + return True + warn(f"{u} textually differs from {v}, please update the relevant case.\n" + "Attempting to recover by resolving redirects") + return (simplify_url(u) == simplify_url(v) + or resolve_redirects(u) == resolve_redirects(v) + ) + + @pytest.mark.net def test_validate_doi() -> None: data = [ @@ -25,7 +53,7 @@ def test_validate_doi() -> None: "https://linkinghub.elsevier.com/retrieve/pii/S0009261497040141"), ] for doi, url in data: - assert url == validate_doi(doi) + assert normalize_eq(url, validate_doi(doi)) for doi in ["", "asdf"]: try: @@ -42,7 +70,7 @@ def test_get_real_url_from_doi() -> None: "article/abs/pii/S0009261497040141"), ] for doi, url in data: - assert url == get_real_url_from_doi(doi) + assert normalize_eq(url, get_real_url_from_doi(doi)) def test_find_doi_in_line() -> None: From ca6dcf0f03e8402ef13ea3375eb96734af77dcfe Mon Sep 17 00:00:00 2001 From: gesh Date: Sun, 29 Jun 2025 17:15:02 +0300 Subject: [PATCH 09/11] Use cloudscraper to 
solve cloudflare challenges Also put in a fallback using requests, but it is hacky and only works sometimes. cloudscraper stands a better chance of consistently being able to get to the final URL --- pyproject.toml | 5 +++++ tests/test_doi.py | 44 +++++++++++++++++++++++++++++++++++++------- 2 files changed, 42 insertions(+), 7 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index aa6195c..faba3d3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,6 +48,11 @@ docs = [ "sphinx>=4", "sphinx_rtd_theme>=1", ] +# For solving client-side challenges on DDoS-protected sites +# (eg those using CloudFlare) +challenges = [ + "cloudscraper", +] [project.urls] Repository = "https://github.com/papis/python-doi" diff --git a/tests/test_doi.py b/tests/test_doi.py index d062379..1668af0 100644 --- a/tests/test_doi.py +++ b/tests/test_doi.py @@ -1,6 +1,10 @@ import os -from urllib.request import Request, urlopen +import requests +try: + import cloudscraper +except ImportError: + cloudscraper = None from urllib.parse import urlparse, urlunparse from warnings import warn @@ -21,21 +25,47 @@ def resolve_redirects(u): # If removed, it'd make sense to canonicalize in simplify_url instead to # prevent spurious test failures u = urlunparse(urlparse(u)._replace(scheme='https')) - req = Request(u, headers={'User-Agent': 'Mozilla/5.0'}) - with urlopen(req) as r: - return simplify_url(r.url) + if cloudscraper: + scraper = cloudscraper.create_scraper() + return simplify_url(scraper.get(u).url) -def normalize_eq(u, v): + # Try emulating a browser to not get blocked + h = {'User-Agent': 'Mozilla/5.0'} + resp = requests.get(u, headers=h) + return simplify_url(resp.url) + + +def normalize_eq(u, v, expect_diff=False): if u == v: return True - warn(f"{u} textually differs from {v}, please update the relevant case.\n" - "Attempting to recover by resolving redirects") + if not expect_diff: + warn(f"{u} textually differs from {v}, please update the relevant case.\n" + "Attempting to 
recover by resolving redirects") return (simplify_url(u) == simplify_url(v) or resolve_redirects(u) == resolve_redirects(v) ) +@pytest.mark.net +@pytest.mark.parametrize( + "needs_cloudscraper, urls", + [ + (True, + ["http://pubs.aip.org/aip/jcp/article/150/7/074102/197572/Exact-two-component-equation-of-motion-coupled", # noqa: E501 + "http://pubs.aip.org/jcp/article/150/7/074102/197572/Exact-two-component-equation-of-motion-coupled", # noqa: E501 + "http://aip.scitation.org/doi/10.1063/1.5081715" + ]), + ] +) +def test_redirect(needs_cloudscraper, urls) -> None: + base = urls[0] + if needs_cloudscraper and cloudscraper is None: + pytest.skip(f"cloudscraper needed to solve CloudFlare challenge on {base}") + for other in urls[1:]: + assert normalize_eq(base, other, expect_diff=True) + + @pytest.mark.net def test_validate_doi() -> None: data = [ From 8e5f3c9228c7ad4348a6af97e8192fac57c3865d Mon Sep 17 00:00:00 2001 From: gesh Date: Sun, 29 Jun 2025 17:13:54 +0300 Subject: [PATCH 10/11] Parametrize tests This eg makes it easier to spot which particular iteration breaks --- tests/test_doi.py | 48 ++++++++++++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/tests/test_doi.py b/tests/test_doi.py index 1668af0..91f9e51 100644 --- a/tests/test_doi.py +++ b/tests/test_doi.py @@ -67,8 +67,9 @@ def test_redirect(needs_cloudscraper, urls) -> None: @pytest.mark.net -def test_validate_doi() -> None: - data = [ +@pytest.mark.parametrize( + "doi,url", + [ ("10.1063/1.5081715", "https://pubs.aip.org/jcp/article/150/7/074102/197572/Exact-two-component-equation-of-motion-coupled"), # noqa: E501 ("10.1007%2FBF01451751", @@ -82,29 +83,41 @@ def test_validate_doi() -> None: ("10.1016/S0009-2614(97)04014-1", "https://linkinghub.elsevier.com/retrieve/pii/S0009261497040141"), ] - for doi, url in data: - assert normalize_eq(url, validate_doi(doi)) +) +def test_validate_doi(doi, url) -> None: + assert normalize_eq(url, validate_doi(doi)) + - 
for doi in ["", "asdf"]: - try: - validate_doi(doi) - except ValueError as e: - assert str(e) == "HTTP 404: DOI not found" +@pytest.mark.parametrize( + "doi", + [ + "", + "asdf" + ] +) +def test_validate_invalid_doi(doi) -> None: + try: + validate_doi(doi) + except ValueError as e: + assert str(e) == "HTTP 404: DOI not found" @pytest.mark.net -def test_get_real_url_from_doi() -> None: - data = [ +@pytest.mark.parametrize( + "doi,url", + [ ("10.1016/S0009-2614(97)04014-1", "https://www.sciencedirect.com/science/" "article/abs/pii/S0009261497040141"), ] - for doi, url in data: - assert normalize_eq(url, get_real_url_from_doi(doi)) +) +def test_get_real_url_from_doi(doi, url) -> None: + assert normalize_eq(url, get_real_url_from_doi(doi)) -def test_find_doi_in_line() -> None: - test_data = [ +@pytest.mark.parametrize( + "url, doi", + [ ("http://dx.doi.org/10.1063/1.881498", "10.1063/1.881498"), ("http://dx.doi.org/10.1063%2F1.881498", "10.1063/1.881498"), (2 * "qer " + "var doi = '12345/12345.3'", "12345/12345.3"), @@ -133,8 +146,9 @@ def test_find_doi_in_line() -> None: "10.1016/j.comptc.2018.10.004"), ("doi(10.1038/s41535-018-0103-6;)", "10.1038/s41535-018-0103-6"), ] - for url, doi in test_data: - assert find_doi_in_text(url) == doi +) +def test_find_doi_in_line(url, doi) -> None: + assert find_doi_in_text(url) == doi def test_doi_from_pdf() -> None: From ab9d72ae86c3ef83cdf2e56793875057b0d47788 Mon Sep 17 00:00:00 2001 From: gesh Date: Sun, 29 Jun 2025 18:06:03 +0300 Subject: [PATCH 11/11] Make test_redirect cases prettier --- tests/test_doi.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/test_doi.py b/tests/test_doi.py index 91f9e51..1049b8b 100644 --- a/tests/test_doi.py +++ b/tests/test_doi.py @@ -47,9 +47,15 @@ def normalize_eq(u, v, expect_diff=False): ) +def listmin(param): + if isinstance(param, list): + return min(param) + return "" + + @pytest.mark.net @pytest.mark.parametrize( - "needs_cloudscraper, urls", + 
"needs_cloudscraper, urls", ids=listmin, argvalues= [ (True, ["http://pubs.aip.org/aip/jcp/article/150/7/074102/197572/Exact-two-component-equation-of-motion-coupled", # noqa: E501