
Commit 9e6d029

Merge pull request #6 from LCAS/copilot/refactor-figshare-script
Refactor figshare.py into multiple scripts for better structure
2 parents: 68e298e + f238706

8 files changed: +886 −704 lines


.github/workflows/figshare-processing.yaml

Lines changed: 12 additions & 5 deletions
@@ -59,19 +59,26 @@ jobs:
           python -m pip install --upgrade pip
           pip install -r requirements-frozen.txt
 
-      - name: Run figshare exporter
+      - name: Run figshare fetch (Step 1 - Retrieve articles and create CSV)
         env:
           FIGSHARE_TOKEN: ${{ secrets.FIGSHARE_TOKEN }}
         run: |
           set -e
           cd ./output
           if [ "${{ github.event_name }}" = "workflow_dispatch" ] && [ "${{ github.event.inputs.use_author_cache }}" = "true" ]; then
-            echo "Running with --use-author-cache (manually triggered)"
-            python ../figshare.py --use-author-cache
+            echo "Running figshare_fetch.py with --use-author-cache (manually triggered)"
+            python ../figshare_fetch.py --use-author-cache
           else
-            echo "Running without cache (default behavior)"
-            python ../figshare.py --rate-limit-delay 3
+            echo "Running figshare_fetch.py without cache (default behavior)"
+            python ../figshare_fetch.py --rate-limit-delay 1 --max-retries 30
           fi
+
+      - name: Run figshare bibtex (Step 2 - Generate bibtex from CSV)
+        run: |
+          set -e
+          cd ./output
+          echo "Running figshare_bibtex.py to generate bibtex from CSV"
+          python ../figshare_bibtex.py
 
       - name: Save Cache from folder ./output
         uses: actions/cache/save@v5
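
The workflow now splits the pipeline in two: figshare_fetch.py retrieves articles and writes a CSV, then figshare_bibtex.py generates bibtex from that CSV. Neither script body appears in this excerpt; the following is only a rough sketch of what Step 2 plausibly does with the helpers added below — the CSV filename and column names are assumptions, not the repository's actual code.

# Hypothetical sketch of figshare_bibtex.py (not shown in this diff).
# The CSV filename and the 'External DOI'/'bibtex_str' columns are assumptions.
import pandas as pd

from doi2bib import doi2bib

df = pd.read_csv("articles.csv")
converter = doi2bib()
for index, row in df.iterrows():
    entry = converter.get_bibtex_entry(row["External DOI"])
    if entry is not None:
        df.at[index, "bibtex_str"] = converter.entries_to_str([entry])
df.to_csv("articles.csv", index=False)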

author.py

Lines changed: 161 additions & 0 deletions

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import pandas as pd
import shelve
import re
from logging import getLogger, INFO, DEBUG
from flatten_dict import flatten

from figshare_api import FigShare
from doi2bib import doi2bib
from doi_utils import guess_doi_from_crossref


class Author:
    def __init__(self, name, debug=False, rate_limit_delay=1.0, max_retries=5):
        self.logger = getLogger("Author")
        if debug:
            self.logger.setLevel(DEBUG)
        self.name = name
        self.fs = FigShare(rate_limit_delay=rate_limit_delay, max_retries=max_retries)
        self.articles = {}
        self.public_html_prefix = "https://repository.lincoln.ac.uk"
        self.df = None

    def save(self, filename=None):
        if filename is None:
            filename = f"{self.name}.db"
        with shelve.open(filename) as db:
            db['articles'] = self.articles
            db['df'] = self.df

    def load(self, filename=None):
        if filename is None:
            filename = f"{self.name}.db"
        with shelve.open(filename) as db:
            self.articles = db['articles']
            self.df = db['df']

    def _retrieve_figshare(self, use_cache=True):
        self.logger.info(f"retrieving articles for {self.name}")
        self.articles = self.fs.articles_by_user_name(self.name, use_cache=use_cache)
        self.logger.info(f"found {len(self.articles)} articles for {self.name}")

    def _retrieve_details(self, use_cache=True):
        for article in self.articles:
            self.logger.info(f"retrieving details for article {article['id']}")
            article['details'] = self.fs.get_article(article['id'], use_cache=use_cache)

    def _remove_non_repository(self):
        self.logger.info(f"removing non-repository articles out of {len(self.articles)}")
        self.articles = [a for a in self.articles if a['url_public_html'].startswith(self.public_html_prefix)]
        self.logger.info(f"retained {len(self.articles)} articles")

    def _custom_fields_to_dicts(self):
        # convert each article's list of {'name': ..., 'value': ...} custom
        # fields into a plain dict keyed by field name
        for article in self.articles:
            if 'details' not in article:
                continue
            if 'custom_fields' not in article['details']:
                continue
            self.logger.debug("converting custom_fields list to dict")
            cf = article['details']['custom_fields']
            if isinstance(cf, list):
                new_cf = {}
                for p in cf:
                    new_cf[p['name']] = p['value']
                article['details']['custom_fields'] = new_cf

    def _guess_doi(self, article):
        """
        Use the Crossref API to guess the DOI for an article based on its title and author.
        """
        if 'title' not in article or not article['title']:
            self.logger.warning("No title found for article, can't guess DOI")
            return None

        title = article['title']
        author = article['author']

        return guess_doi_from_crossref(title, author)

    def _retrieve_bibtex_from_dois(self):
        if self.df is None:
            self.logger.warning(f"no dataframe found for {self.name}, can't continue")
            return
        doi2bibber = doi2bib()
        # iterate over all rows in the dataframe self.df
        for index, row in self.df.iterrows():
            doi = row['External DOI']
            # check that the DOI is in a valid format
            if doi and isinstance(doi, str):
                # basic DOI validation: must start with "10." and contain a "/"
                if not doi.startswith('10.') or not len(doi.split('/', 1)) == 2:
                    self.logger.warning(f"Invalid DOI format: {doi}, will try to guess")
                    doi = None
            else:
                self.logger.info("No DOI defined in record for article, will try to guess")
                doi = None
            if doi is None:
                doi = self._guess_doi(row)
                if doi is None:
                    self.logger.debug("Unable to guess DOI for article, no option left but to skip it")
                    continue
                self.logger.info(f"Guessed DOI for article: {doi}, updating dataframe")
                self.df.at[index, 'External DOI'] = doi
            try:
                bibtex = doi2bibber.get_bibtex_entry(doi)
                # update the dataframe with the bibtex information
                if bibtex is not None:
                    self.df.at[index, 'bibtex'] = bibtex
                    self.df.at[index, 'bibtex_str'] = doi2bibber.entries_to_str([bibtex])
                    self.logger.info(f"got bibtex for {doi}")
                else:
                    self.logger.warning(f"Couldn't get bibtex for {doi}")
            except Exception as e:
                self.logger.warning(f"Failed to get bibtex for {doi}: {e}")

    def _flatten(self):
        # flatten nested article dicts into single-level dicts with
        # path-style keys, e.g. 'details/custom_fields/External DOI'
        new_articles = []
        for a in self.articles:
            new_articles.append(flatten(a, reducer='path'))
        self.articles = new_articles

    def retrieve(self, use_cache=True):
        self._retrieve_figshare(use_cache=use_cache)
        self._remove_non_repository()
        self._retrieve_details(use_cache=True)
        self._custom_fields_to_dicts()
        self._flatten()
        self._create_dataframe()
        self._retrieve_bibtex_from_dois()

    def _create_dataframe(self):
        if len(self.articles) == 0:
            self.logger.warning(f"no articles found for {self.name}, can't create dataframe")
            self.df = None
            return
        self.df = pd.DataFrame.from_dict(self.articles)
        # add column with author name
        self.df['author'] = self.name
        # add column with online date (as datetime object)
        self.df['online_date'] = pd.to_datetime(self.df['timeline/firstOnline'], utc=True)
        # add column with online year
        self.df['online_year'] = self.df['online_date'].apply(lambda x: x.year)
        # add column with external DOI, parsed from custom_fields
        self.df['External DOI'] = self.df['details/custom_fields/External DOI'].apply(
            lambda x: re.sub(r'^(?:https?://doi\.org/|doi:)', '', x[0], flags=re.IGNORECASE).replace('doi:', '')
            if isinstance(x, list) and len(x) > 0 else None
        )
        return self.df
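
A minimal usage sketch for the class above; the author name and CSV filename are placeholders, and FigShare API access is assumed to be handled by figshare_api:

# Hypothetical usage; "Jane Smith" and the CSV filename are placeholders.
from author import Author

a = Author("Jane Smith", rate_limit_delay=1.0, max_retries=30)
a.retrieve(use_cache=True)  # fetch, filter, flatten, build dataframe, attach bibtex
a.save()                    # persists articles and dataframe to "Jane Smith.db"
if a.df is not None:
    a.df.to_csv("jane_smith_articles.csv", index=False)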

doi2bib.py

Lines changed: 100 additions & 0 deletions

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import urllib.request
import requests
import bibtexparser
from bibtexparser.bparser import BibTexParser
from bibtexparser.bibdatabase import BibDatabase
import shelve
from logging import getLogger, INFO


class doi2bib:

    def __init__(self):
        self.bibtext_cache_file = "bibtext_cache"
        self.shortdoi_cache_file = "shortdoi_cache"
        self.logger = getLogger("doi2bib")
        self.logger.setLevel(INFO)

    def shorten(self, doi):
        """
        Get the shortDOI for a DOI. A persistent cache prevents
        multiple API requests for the same DOI.
        """
        with shelve.open(self.shortdoi_cache_file) as cache:
            if doi in cache:
                self.logger.debug(f"short doi for {doi} found in cache")
                return cache[doi]
            quoted_doi = urllib.request.quote(doi)
            url = 'http://shortdoi.org/{}?format=json'.format(quoted_doi)
            try:
                response = requests.get(url)
                # check that the response is valid and contains JSON
                if response.ok and response.headers.get('Content-Type', '').lower().startswith('application/json') and response.text.strip():
                    result = response.json()
                    short_doi = result['ShortDOI']
                else:
                    self.logger.warning(f"Received empty or invalid JSON response for {doi} from {url} (status: {response.status_code})")
                    return None
            except Exception as e:
                self.logger.warning(f"failed to get short doi for {doi}: {e}")
                return None
            self.logger.debug(f"short doi for {doi} is {short_doi}, caching it")
            cache[doi] = short_doi
            return short_doi

    def get_bibtext(self, doi):
        """
        Use DOI Content Negotiation (http://crosscite.org/cn/) to retrieve a string
        with the bibtex entry.
        """
        with shelve.open(self.bibtext_cache_file) as cache:
            if doi in cache:
                self.logger.debug(f"bibtex for {doi} found in cache")
                return cache[doi]
            url = 'https://doi.org/' + urllib.request.quote(doi)
            header = {
                'Accept': 'application/x-bibtex',
            }
            response = requests.get(url, headers=header)
            if not response.ok:
                self.logger.warning(f"failed to get bibtex for {doi}, status code {response.status_code}")
                return ""
            bibtext = response.text
            if bibtext:
                self.logger.debug(f"bibtex for {doi} found, caching it")
                cache[doi] = bibtext
            else:
                self.logger.warning(f"failed to get bibtex for {doi}")
            return bibtext

    def get_bibtex_entry(self, doi):
        """
        Return a bibtexparser entry for a DOI.
        """
        bibtext = self.get_bibtext(doi)
        if not bibtext:
            return None

        short_doi = self.shorten(doi)
        parser = BibTexParser()
        parser.ignore_nonstandard_types = False
        bibdb = bibtexparser.loads(bibtext, parser)
        entry, = bibdb.entries
        quoted_doi = urllib.request.quote(doi)
        entry['link'] = 'https://doi.org/{}'.format(quoted_doi)
        if 'author' in entry:
            entry['author'] = ' and '.join(entry['author'].rstrip(';').split('; '))
        # shortDOI values look like "10/abcde"; strip the "10/" prefix for the bibtex key
        entry['ID'] = short_doi[3:]
        return entry

    def entries_to_str(self, entries):
        """
        Pass a list of bibtexparser entries and return a bibtex formatted string.
        """
        db = BibDatabase()
        db.entries = entries
        return bibtexparser.dumps(db)
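
A minimal usage sketch for the class above; the DOI is a placeholder, so a real run needs a valid, resolvable one:

# Hypothetical usage; "10.1234/example" is a placeholder DOI.
from doi2bib import doi2bib

converter = doi2bib()
entry = converter.get_bibtex_entry("10.1234/example")
if entry is not None:
    print(converter.entries_to_str([entry]))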

doi_utils.py

Lines changed: 103 additions & 0 deletions

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Utility functions for DOI guessing and retrieval.
"""

import requests
import shelve
from logging import getLogger
from difflib import SequenceMatcher

logger = getLogger("doi_utils")


def guess_doi_from_crossref(title, author):
    """
    Use the Crossref API to guess the DOI for an article based on the title and author.

    Args:
        title: Article title
        author: Author name

    Returns:
        DOI string if found, None otherwise
    """
    with shelve.open("crossref_cache.db") as cache:
        if not title:
            logger.warning("No title found for article, can't guess DOI")
            return None

        if title in cache:
            logger.info(f"Found DOI {cache[title]} in cache for title: {title}")
            return cache[title]

        # construct the query for the Crossref API
        base_url = "https://api.crossref.org/works"
        params = {
            "query.bibliographic": title,
            "query.author": author,
            "sort": "relevance",
            "rows": 10,  # get top 10 matches
            "select": "DOI,title,author",
        }

        try:
            logger.debug(f"Querying Crossref for title: {title}")
            response = requests.get(base_url, params=params)
            response.raise_for_status()

            # check that the response is valid and contains JSON
            if response.ok and response.headers.get('Content-Type', '').lower().startswith('application/json') and response.text.strip():
                data = response.json()
            else:
                logger.warning(f"Received empty or invalid JSON response from Crossref API (status: {response.status_code})")
                return None

            if data["message"]["total-results"] == 0:
                logger.debug(f"No DOI found for: {title}")
                return None

            # get all matches and find the best one using fuzzy matching
            items = data["message"]["items"]
            if items:
                logger.debug(f"Found {len(items)} potential matches for title: {title}")

                best_match = None
                best_score = 0
                threshold = 0.8  # minimum similarity score to accept a match

                for item in items:
                    if "title" in item and item["title"]:
                        item_title = item["title"][0]
                        # calculate the similarity score between the two titles
                        score = SequenceMatcher(None, title.lower(), item_title.lower()).ratio()
                        logger.debug(f"comparing '{title}' with '{item_title}': score {score:.2f}")

                        if score > best_score:
                            best_score = score
                            best_match = item

                if best_match and best_score >= threshold:
                    doi = best_match.get("DOI")
                    authors_string = str(best_match.get("author", ""))
                    authors_last_name = author.split()[-1]

                    if doi and authors_last_name in authors_string:
                        logger.info(f"Found DOI {doi} for title: {title} (match score: {best_score:.2f})")
                        cache[title] = doi
                        return doi
                    else:
                        logger.warning(f"DOI found but author {authors_last_name} not in authors list or DOI missing")
                else:
                    logger.warning(f"No good title match found. Best score was {best_score:.2f}, below threshold {threshold}")
                    if best_match and 'title' in best_match:
                        logger.warning(f"  '{title}' != '{best_match['title'][0]}' (score: {best_score:.2f})")

            return None

        except Exception as e:
            logger.warning(f"Error guessing DOI: {e}")

    return None
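
A minimal usage sketch for the function above; the title and author are placeholders:

# Hypothetical usage; the title and author below are placeholders.
from doi_utils import guess_doi_from_crossref

doi = guess_doi_from_crossref("A survey of agricultural robotics", "Jane Smith")
if doi:
    print(f"best guess: https://doi.org/{doi}")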
