From bd56667026d5e67a7a13c70d9c8ebadb8ed16f3c Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 23 Dec 2025 18:18:51 +0000
Subject: [PATCH 1/5] Initial plan

From 27634be873c31549ba95e94d5d205a9b1475d3ed Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 23 Dec 2025 18:28:05 +0000
Subject: [PATCH 2/5] Refactor figshare.py into multiple files and 2-step process

Co-authored-by: marc-hanheide <1153084+marc-hanheide@users.noreply.github.com>
---
 .github/workflows/figshare-processing.yaml |  17 +-
 author.py                                  | 234 ++++++++++++++++++++
 doi2bib.py                                 | 100 +++++++++
 figshare_api.py                            | 161 ++++++++++++++
 figshare_bibtex.py                         | 241 +++++++++++++++++++++
 figshare_fetch.py                          | 171 +++++++++++++++
 6 files changed, 919 insertions(+), 5 deletions(-)
 create mode 100644 author.py
 create mode 100644 doi2bib.py
 create mode 100644 figshare_api.py
 create mode 100644 figshare_bibtex.py
 create mode 100644 figshare_fetch.py

diff --git a/.github/workflows/figshare-processing.yaml b/.github/workflows/figshare-processing.yaml
index 7ad8c5d..dd45b4d 100644
--- a/.github/workflows/figshare-processing.yaml
+++ b/.github/workflows/figshare-processing.yaml
@@ -59,19 +59,26 @@ jobs:
         python -m pip install --upgrade pip
         pip install -r requirements-frozen.txt
 
-    - name: Run figshare exporter
+    - name: Run figshare fetch (Step 1 - Retrieve articles and create CSV)
       env:
         FIGSHARE_TOKEN: ${{ secrets.FIGSHARE_TOKEN }}
       run: |
         set -e
         cd ./output
         if [ "${{ github.event_name }}" = "workflow_dispatch" ] && [ "${{ github.event.inputs.use_author_cache }}" = "true" ]; then
-          echo "Running with --use-author-cache (manually triggered)"
-          python ../figshare.py --use-author-cache
+          echo "Running figshare_fetch.py with --use-author-cache (manually triggered)"
+          python ../figshare_fetch.py --use-author-cache
         else
-          echo "Running without cache (default behavior)"
-          python ../figshare.py --rate-limit-delay 3
+          echo "Running figshare_fetch.py without cache (default behavior)"
+          python ../figshare_fetch.py --rate-limit-delay 3
         fi
+
+    - name: Run figshare bibtex (Step 2 - Generate bibtex from CSV)
+      run: |
+        set -e
+        cd ./output
+        echo "Running figshare_bibtex.py to generate bibtex from CSV"
+        python ../figshare_bibtex.py
 
     - name: Save Cache from folder ./output
       uses: actions/cache/save@v5

diff --git a/author.py b/author.py
new file mode 100644
index 0000000..e94056f
--- /dev/null
+++ b/author.py
@@ -0,0 +1,234 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import pandas as pd
+import shelve
+import re
+import requests
+from logging import getLogger, INFO, DEBUG
+from flatten_dict import flatten
+from difflib import SequenceMatcher
+
+from figshare_api import FigShare
+from doi2bib import doi2bib
+
+
+class Author:
+    def __init__(self, name, debug=False, rate_limit_delay=1.0, max_retries=5):
+        self.logger = getLogger("Author")
+        if debug:
+            self.logger.setLevel(DEBUG)
+        self.name = name
+        self.fs = FigShare(rate_limit_delay=rate_limit_delay, max_retries=max_retries)
+        self.articles = {}
+        self.public_html_prefix = "https://repository.lincoln.ac.uk"
+        self.df = None
+
+    def save(self, filename=None):
+        if filename is None:
+            filename = f"{self.name}.db"
+        with shelve.open(filename) as db:
+            db['articles'] = self.articles
+            db['df'] = self.df
+
+    def load(self, filename=None):
+        if filename is None:
+            filename = f"{self.name}.db"
+        with shelve.open(filename) as db:
+            self.articles = db['articles']
+            self.df = db['df']
+
+
+    def 
_retrieve_figshare(self, use_cache=True): + self.logger.info(f"retrieving articles for {self.name}") + self.articles = self.fs.articles_by_user_name(self.name, use_cache=use_cache) + + self.logger.info(f"found {len(self.articles)} articles for {self.name}") + + def _retrieve_details(self, use_cache=True): + for article in self.articles: + self.logger.info(f"retrieving details for article {article['id']}") + article['details'] = self.fs.get_article(article['id'], use_cache=use_cache) + + def _remove_non_repository(self): + self.logger.info(f"removing non-repository articles out of {len(self.articles)}") + self.articles = [a for a in self.articles if a['url_public_html'].startswith(self.public_html_prefix)] + self.logger.info(f"retained {len(self.articles)} articles") + + def _custom_fields_to_dicts(self): + for article in self.articles: + if 'details' not in article: + continue + if 'custom_fields' not in article['details']: + continue + self.logger.debug(f"convert") + + cf = article['details']['custom_fields'] + if type(cf) == list: + new_cf = {} + for p in cf: + new_cf[p['name']] = p['value'] + article['details']['custom_fields'] = new_cf + + + def _guess_doi(self, article): + """ + Use crossref API to guess the DOI for an article based on the title and authors + """ + with shelve.open("crossref_cache.db") as cache: + if 'title' not in article or not article['title']: + self.logger.warning("No title found for article, can't guess DOI") + return None + + title = article['title'] + author = article['author'] + + if title in cache: + self.logger.info(f"Found DOI {cache[title]} in cache for title: {title}") + return cache[title] + + # Construct query URL for Crossref API + base_url = "https://api.crossref.org/works" + params = { + "query.query.bibliographic": f"{title}", + "query.author": f"{author}", + "sort": "relevance", + "rows": 10, # Get top 10 matches + "select": "DOI,title,author", + } + + try: + + self.logger.debug(f"Querying Crossref for title: {title}") + response = requests.get(base_url, params=params) + response.raise_for_status() + + # Check if response is valid and contains JSON + if response.ok and response.headers.get('Content-Type', '').lower().startswith('application/json') and response.text.strip(): + data = response.json() + else: + self.logger.warning(f"Received empty or invalid JSON response from Crossref API (status: {response.status_code})") + return None + + if data["message"]["total-results"] == 0: + self.logger.debug(f"No DOI found for: {title}") + return None + + # Get all matches and find the best one using fuzzy matching + items = data["message"]["items"] + if items: + self.logger.debug(f"Found {len(items)} potential matches for title: {title}") + + best_match = None + best_score = 0 + threshold = 0.8 # Minimum similarity score to accept a match + + for item in items: + if "title" in item and item["title"]: + item_title = item["title"][0] + # Calculate similarity score + score = SequenceMatcher(None, title.lower(), item_title.lower()).ratio() + self.logger.debug(f"==== '{title}' == '{item['title'][0]}'??? 
==> {score:.2f}") + + if score > best_score: + best_score = score + best_match = item + + if best_match and best_score >= threshold: + doi = best_match.get("DOI") + authors_string = str(best_match.get("author", "")) + authors_last_name = article['author'].split()[-1] + + if doi and authors_last_name in authors_string: + self.logger.info(f"Found DOI {doi} for title: {title} (match score: {best_score:.2f})") + cache[title] = doi + return doi + else: + self.logger.warning(f"DOI found but author {authors_last_name} not in authors list or DOI missing") + else: + self.logger.warning(f"No good title match found. Best score was {best_score:.2f}, below threshold {threshold}") + self.logger.warning(f" '{title}' != '{best_match['title'][0]}' (score: {best_score:.2f})") + + return None + + except Exception as e: + self.logger.warning(f"Error guessing DOI: {e}") + + return None + + + def _retrieve_bibtex_from_dois(self): + if self.df is None: + self.logger.warning(f"no dataframe found for {self.name}, can't continue") + return + doi2bibber = doi2bib() + # iteratre over all rows in the dataframe self.df + for index, row in self.df.iterrows(): + doi = row['External DOI'] + # Check if DOI is in valid format + if doi and isinstance(doi, str): + # Basic DOI validation - should start with 10. followed by numbers/dots/hyphens + if not doi.startswith('10.') or not len(doi.split('/', 1)) == 2: + self.logger.warning(f"Invalid DOI format: {doi}, will try to guess") + doi = None + else: + self.logger.info(f"No DOI defined in record for article, will try to guess") + doi = None + if doi is None: + doi = self._guess_doi(row) + if doi is None: + self.logger.debug(f"Unable to guess DOI for article, no option left but to skip it") + continue + self.logger.info(f"Guessed DOI for article: {doi}, updating dataframe") + self.df.at[index, 'External DOI'] = doi + try: + bibtex = doi2bibber.get_bibtex_entry(doi) + # Update the dataframe with the bibtex information + if bibtex is not None: + self.df.at[index, 'bibtex'] = bibtex + self.df.at[index, 'bibtex_str'] = doi2bibber.entries_to_str([bibtex]) + self.logger.info(f"got bibtex for {doi}") + else: + self.logger.warning(f"Couldn't get bibtex for {doi}") + + except Exception as e: + self.logger.warning(f"Failed to get bibtex for {doi}: {e}") + + def _flatten(self): + new_articles = [] + for a in self.articles: + new_articles.append(flatten(a, reducer='path')) + self.articles = new_articles + + def retrieve(self, use_cache=True): + self._retrieve_figshare(use_cache=use_cache) + self._remove_non_repository() + self._retrieve_details(use_cache=True) + self._custom_fields_to_dicts() + self._flatten() + self._create_dataframe() + self._retrieve_bibtex_from_dois() + + def _create_dataframe(self): + if len(self.articles) == 0: + self.logger.warning(f"no articles found for {self.name}, can't create dataframe") + self.df = None + return + self.df = pd.DataFrame.from_dict(self.articles) + # add column with author name + self.df['author'] = self.name + # add column with online date (as datetime object) + self.df['online_date'] = pd.to_datetime(self.df['timeline/firstOnline'], utc=True) + # add column with online year + self.df['online_year'] = self.df['online_date'].apply( + lambda x: x.year + ) + # add column with external DOI, parsed from custom_fields + self.df['External DOI'] = self.df['details/custom_fields/External DOI'].apply( + lambda x: re.sub(r'^(?:https?://doi\.org/|doi:)', '', x[0], flags=re.IGNORECASE).replace('doi:','') + if isinstance(x, list) and len(x) > 0 else None + ) 
+ + + + return self.df diff --git a/doi2bib.py b/doi2bib.py new file mode 100644 index 0000000..3233c3d --- /dev/null +++ b/doi2bib.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import urllib.request +import requests +import bibtexparser +from bibtexparser.bparser import BibTexParser +from bibtexparser.bibdatabase import BibDatabase +import shelve +from logging import getLogger, INFO + + +class doi2bib: + + def __init__(self): + self.bibtext_cache_file = "bibtext_cache" + self.shortdoi_cache_file = "shortdoi_cache" + self.logger = getLogger("doi2bib") + self.logger.setLevel(INFO) + + + def shorten(self, doi): + """ + Get the shortDOI for a DOI. Providing a cache dictionary will prevent + multiple API requests for the same DOI. + """ + with shelve.open(self.shortdoi_cache_file) as cache: + if doi in cache: + self.logger.debug(f"short doi for {doi} found in cache") + return cache[doi] + quoted_doi = urllib.request.quote(doi) + url = 'http://shortdoi.org/{}?format=json'.format(quoted_doi) + try: + response = requests.get(url) + # Check if response is valid and contains JSON + if response.ok and response.headers.get('Content-Type', '').lower().startswith('application/json') and response.text.strip(): + result = response.json() + short_doi = result['ShortDOI'] + else: + self.logger.warning(f"Received empty or invalid JSON response for {doi} from {url} (status: {response.status_code})") + return None + except Exception as e: + self.logger.warning(f"failed to get short doi for {doi}: {e}") + return None + self.logger.debug(f"short doi for {doi} is {short_doi}, caching it") + cache[doi] = short_doi + return short_doi + + def get_bibtext(self, doi): + """ + Use DOI Content Negotioation (http://crosscite.org/cn/) to retrieve a string + with the bibtex entry. + """ + with shelve.open(self.bibtext_cache_file) as cache: + if doi in cache: + self.logger.debug(f"bibtex for {doi} found in cache") + return cache[doi] + url = 'https://doi.org/' + urllib.request.quote(doi) + header = { + 'Accept': 'application/x-bibtex', + } + response = requests.get(url, headers=header) + if not response.ok: + self.logger.warning(f"failed to get bibtex for {doi}, status code {response.status_code}") + return "" + bibtext = response.text + if bibtext: + self.logger.debug(f"bibtex for {doi} found, caching it") + cache[doi] = bibtext + else: + self.logger.warning(f"failed to get bibtex for {doi}") + return bibtext + + def get_bibtex_entry(self, doi): + """ + Return a bibtexparser entry for a DOI + """ + bibtext = self.get_bibtext(doi) + if not bibtext: + return None + + short_doi = self.shorten(doi) + parser = BibTexParser() + parser.ignore_nonstandard_types = False + bibdb = bibtexparser.loads(bibtext, parser) + entry, = bibdb.entries + quoted_doi = urllib.request.quote(doi) + entry['link'] = 'https://doi.org/{}'.format(quoted_doi) + if 'author' in entry: + entry['author'] = ' and '.join(entry['author'].rstrip(';').split('; ')) + entry['ID'] = short_doi[3:] + return entry + + def entries_to_str(self, entries): + """ + Pass a list of bibtexparser entries and return a bibtex formatted string. 
+ """ + db = BibDatabase() + db.entries = entries + return bibtexparser.dumps(db) diff --git a/figshare_api.py b/figshare_api.py new file mode 100644 index 0000000..cedea9d --- /dev/null +++ b/figshare_api.py @@ -0,0 +1,161 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from requests import get, post +import shelve +import time +import os +from logging import getLogger, INFO + + +class FigShare: + def __init__(self, page_size=100, rate_limit_delay=1.0, max_retries=5): + self.logger = getLogger("FigShare") + self.token = os.getenv('FIGSHARE_TOKEN') + if self.token: + self.logger.info("Figshare API: Using authenticated requests") + else: + self.logger.warning("Figshare API: No authentication token found - using anonymous requests (may hit rate limits or receive 403 errors)") + self.page_size = page_size + self.rate_limit_delay = rate_limit_delay + self.max_retries = max_retries + self.base_url = "https://api.figshare.com/v2" + + if self.rate_limit_delay > 0: + self.logger.info(f"Rate limiting enabled: {self.rate_limit_delay} second delay between API requests") + + # Use shelve for persistent caching + self.cache_file = "figshare_cache.db" + + with shelve.open(self.cache_file) as cache: + self.logger.info(f"Figshare API: Using cache file {self.cache_file} with {len(cache.keys())} entries") + for key in list(cache.keys()): + self.logger.debug(f" existing cache key: {key}") + + + def __init_params(self): + return { + "page_size": self.page_size + } + + def __handle_403_error(self, url, method="GET", response_text=""): + """Handle 403 Forbidden errors with helpful messages""" + if not self.token: + self.logger.error(f"403 Forbidden for {method} {self.base_url + url}: " + f"Authentication required. Set FIGSHARE_TOKEN environment variable. " + f"See README.md for instructions.") + else: + self.logger.error(f"403 Forbidden for {method} {self.base_url + url}: " + f"Token may be invalid or lack permissions. " + f"Check token in Figshare account settings.") + if response_text: + self.logger.error(f"Response text: {response_text}") + + def __get(self, url, params=None, use_cache=True): + hash_key = f"GET{url}{'?' 
+ str(params) if params else ''}" + + with shelve.open(self.cache_file) as cache: + if hash_key in cache and use_cache: + self.logger.info(f"Cache hit for GET {url}") + return cache[hash_key] + + headers = { "Authorization": "token " + self.token } if self.token else {} + + # Retry logic for 403 errors + for attempt in range(self.max_retries): + response = get(self.base_url + url, headers=headers, params=params) + + # Handle 403 Forbidden errors with retry logic + if response.status_code == 403: + if attempt < self.max_retries - 1: + # Exponential backoff: 1s, 2s, 4s, 8s, 16s + wait_time = 2 ** attempt + self.logger.warning(f"403 Forbidden for GET {url} (attempt {attempt + 1}/{self.max_retries}), retrying in {wait_time}s...") + time.sleep(wait_time) + continue + else: + # Final attempt failed, log error and return + self.__handle_403_error(url, "GET", response.text) + return {} + + # Success - break out of retry loop + break + + # Rate limiting: sleep after each API request + if self.rate_limit_delay > 0: + time.sleep(self.rate_limit_delay) + + # Check if response is valid and contains JSON + if response.ok and response.headers.get('Content-Type', '').lower().startswith('application/json') and response.text.strip(): + result = response.json() + cache[hash_key] = result + self.logger.debug(f"Cached result for GET {url}") + return result + else: + self.logger.warning(f"Received empty or invalid JSON response for GET {self.base_url + url} (status: {response.status_code})") + return {} + + def __post(self, url, params=None, use_cache=True): + hash_key = f"POST{url}{'?' + str(params) if params else ''}" + + with shelve.open(self.cache_file) as cache: + if hash_key in cache and use_cache: + self.logger.debug(f"Cache hit for POST {url}") + return cache[hash_key] + + headers = { "Authorization": "token " + self.token } if self.token else {} + + # Retry logic for 403 errors + for attempt in range(self.max_retries): + response = post(self.base_url + url, headers=headers, json=params) + + # Handle 403 Forbidden errors with retry logic + if response.status_code == 403: + if attempt < self.max_retries - 1: + # Exponential backoff: 1s, 2s, 4s, 8s, 16s + wait_time = 2 ** attempt + self.logger.warning(f"403 Forbidden for POST {url} (attempt {attempt + 1}/{self.max_retries}), retrying in {wait_time}s...") + time.sleep(wait_time) + continue + else: + # Final attempt failed, log error and return + self.__handle_403_error(url, "POST", response.text) + return [] + + # Success - break out of retry loop + break + + # Rate limiting: sleep after each API request + if self.rate_limit_delay > 0: + time.sleep(self.rate_limit_delay) + + # Check if response is valid and contains JSON + if response.ok and response.headers.get('Content-Type', '').lower().startswith('application/json') and response.text.strip(): + result = response.json() + cache[hash_key] = result + self.logger.debug(f"Cached result for POST {url}") + return result + else: + self.logger.warning(f"Received empty or invalid JSON response for POST {self.base_url + url} (status: {response.status_code})") + return [] + + + def articles_by_user_name(self, user_name, use_cache=True): + params = self.__init_params() + params["search_for"] = f':author: \"{user_name}\"' + page = 1 + articles = [] + while True: + params["page"] = page + self.logger.info(f"retrieving page {page} for user {user_name}") + current_page_articles = self.__post("/articles/search", params=params, use_cache=use_cache) + page += 1 + if len(current_page_articles) == 0: + break + articles += 
current_page_articles + self.logger.info(f"found {len(articles)} articles for user {user_name}") + + return articles + + def get_article(self, article_id, use_cache=True): + return self.__get(f"/articles/{article_id}", use_cache=use_cache) diff --git a/figshare_bibtex.py b/figshare_bibtex.py new file mode 100644 index 0000000..5df8123 --- /dev/null +++ b/figshare_bibtex.py @@ -0,0 +1,241 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Script 2: Read CSV files and generate bibtex from article details + +This script: +1. Reads the deduplicated CSV file produced by figshare_fetch.py +2. Retrieves article details and DOIs +3. Generates bibtex entries for each article +4. Exports bibtex files +""" + +import pandas as pd +import bibtexparser +from bibtexparser.bibdatabase import BibDatabase +import argparse +import re +import requests +import shelve +from logging import getLogger, basicConfig, INFO, DEBUG +from difflib import SequenceMatcher + +from doi2bib import doi2bib + +basicConfig(level=INFO) +logger = getLogger(__name__) + + +def parse_args(): + """Parse command-line arguments.""" + parser = argparse.ArgumentParser( + description="Read CSV file and generate bibtex entries from article details.", + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument('-i', '--input', type=str, default='figshare_articles.csv', + help='Input CSV filename (deduplicated articles from figshare_fetch.py)') + parser.add_argument('-o', '--output', type=str, default='lcas.bib', + help='Output bibtex filename') + parser.add_argument('--debug', action='store_true', + help='Enable debug logging') + + return parser.parse_args() + + +def guess_doi(article_row): + """ + Use crossref API to guess the DOI for an article based on the title and authors + """ + with shelve.open("crossref_cache.db") as cache: + if 'title' not in article_row or not article_row['title']: + logger.warning("No title found for article, can't guess DOI") + return None + + title = article_row['title'] + author = article_row['author'] + + if title in cache: + logger.info(f"Found DOI {cache[title]} in cache for title: {title}") + return cache[title] + + # Construct query URL for Crossref API + base_url = "https://api.crossref.org/works" + params = { + "query.query.bibliographic": f"{title}", + "query.author": f"{author}", + "sort": "relevance", + "rows": 10, # Get top 10 matches + "select": "DOI,title,author", + } + + try: + + logger.debug(f"Querying Crossref for title: {title}") + response = requests.get(base_url, params=params) + response.raise_for_status() + + # Check if response is valid and contains JSON + if response.ok and response.headers.get('Content-Type', '').lower().startswith('application/json') and response.text.strip(): + data = response.json() + else: + logger.warning(f"Received empty or invalid JSON response from Crossref API (status: {response.status_code})") + return None + + if data["message"]["total-results"] == 0: + logger.debug(f"No DOI found for: {title}") + return None + + # Get all matches and find the best one using fuzzy matching + items = data["message"]["items"] + if items: + logger.debug(f"Found {len(items)} potential matches for title: {title}") + + best_match = None + best_score = 0 + threshold = 0.8 # Minimum similarity score to accept a match + + for item in items: + if "title" in item and item["title"]: + item_title = item["title"][0] + # Calculate similarity score + score = SequenceMatcher(None, title.lower(), item_title.lower()).ratio() + logger.debug(f"==== '{title}' == 
'{item['title'][0]}'??? ==> {score:.2f}") + + if score > best_score: + best_score = score + best_match = item + + if best_match and best_score >= threshold: + doi = best_match.get("DOI") + authors_string = str(best_match.get("author", "")) + authors_last_name = author.split()[-1] + + if doi and authors_last_name in authors_string: + logger.info(f"Found DOI {doi} for title: {title} (match score: {best_score:.2f})") + cache[title] = doi + return doi + else: + logger.warning(f"DOI found but author {authors_last_name} not in authors list or DOI missing") + else: + logger.warning(f"No good title match found. Best score was {best_score:.2f}, below threshold {threshold}") + if best_match: + logger.warning(f" '{title}' != '{best_match['title'][0]}' (score: {best_score:.2f})") + + return None + + except Exception as e: + logger.warning(f"Error guessing DOI: {e}") + + return None + + +def retrieve_bibtex_from_dois(df): + """ + Retrieve bibtex entries for all articles in the dataframe + """ + if df is None or len(df) == 0: + logger.warning(f"no dataframe provided, can't continue") + return df + + doi2bibber = doi2bib() + + # Add bibtex columns if they don't exist + if 'bibtex' not in df.columns: + df['bibtex'] = None + if 'bibtex_str' not in df.columns: + df['bibtex_str'] = None + + # Iterate over all rows in the dataframe + for index, row in df.iterrows(): + doi = row['External DOI'] if 'External DOI' in row else None + + # Check if DOI is in valid format + if doi and isinstance(doi, str): + # Basic DOI validation - should start with 10. followed by numbers/dots/hyphens + if not doi.startswith('10.') or not len(doi.split('/', 1)) == 2: + logger.warning(f"Invalid DOI format: {doi}, will try to guess") + doi = None + else: + logger.info(f"No DOI defined in record for article, will try to guess") + doi = None + + if doi is None: + doi = guess_doi(row) + if doi is None: + logger.debug(f"Unable to guess DOI for article, no option left but to skip it") + continue + logger.info(f"Guessed DOI for article: {doi}, updating dataframe") + df.at[index, 'External DOI'] = doi + + try: + bibtex = doi2bibber.get_bibtex_entry(doi) + # Update the dataframe with the bibtex information + if bibtex is not None: + df.at[index, 'bibtex'] = bibtex + df.at[index, 'bibtex_str'] = doi2bibber.entries_to_str([bibtex]) + logger.info(f"got bibtex for {doi}") + else: + logger.warning(f"Couldn't get bibtex for {doi}") + + except Exception as e: + logger.warning(f"Failed to get bibtex for {doi}: {e}") + + return df + + +def figshare_bibtex(): + """ + Read CSV file and generate bibtex entries from article details. + + This function: + 1. Reads the deduplicated CSV file + 2. Retrieves bibtex for each article based on DOI + 3. Exports bibtex file + """ + args = parse_args() + + if args.debug: + logger.setLevel(DEBUG) + + # Check if input file exists + import os + if not os.path.exists(args.input): + logger.error(f"Input file {args.input} not found. 
Please run figshare_fetch.py first.") + return + + logger.info(f"Reading articles from {args.input}") + df = pd.read_csv(args.input, encoding='utf-8') + logger.info(f"Loaded {len(df)} articles from CSV") + + # Retrieve bibtex for all articles + logger.info("Retrieving bibtex entries for all articles...") + df = retrieve_bibtex_from_dois(df) + + # Export bibtex file + bibtex_filename = args.output + bibtex = BibDatabase() + bibtex.entries = [entry for entry in df['bibtex'].tolist() if isinstance(entry, dict)] + + # Process all entries in the bibtex database and remove any duplicates based on ID + unique_entries = {} + for entry in bibtex.entries: + if entry and 'ID' in entry: + # Use ID as the key to avoid duplicates + unique_entries[entry['ID']] = entry + else: + logger.debug(f"Skipping entry without ID: {entry}") + + logger.info(f"Reduced from {len(bibtex.entries)} to {len(unique_entries)} unique bibtex entries") + + # Replace the entries with the unique ones + bibtex.entries = list(unique_entries.values()) + + with open(bibtex_filename, 'w') as f: + f.write(bibtexparser.dumps(bibtex)) + + logger.info(f"Saved {len(unique_entries)} bibtex entries to {bibtex_filename}") + logger.info("Bibtex generation complete") + + +if __name__ == "__main__": + figshare_bibtex() diff --git a/figshare_fetch.py b/figshare_fetch.py new file mode 100644 index 0000000..37c5c56 --- /dev/null +++ b/figshare_fetch.py @@ -0,0 +1,171 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Script 1: Fetch articles from Figshare API and create CSV files + +This script: +1. Retrieves publication data for each author from FigShare API +2. Processes and flattens the article data +3. Creates deduplicated CSV files +4. Does NOT retrieve bibtex (handled by script 2) +""" + +import pandas as pd +import os +import argparse +from logging import getLogger, basicConfig, INFO, DEBUG + +from author import Author + +basicConfig(level=INFO) +logger = getLogger(__name__) + + +def parse_args(): + """Parse command-line arguments.""" + parser = argparse.ArgumentParser( + description="Fetch publications from FigShare repository for specified authors and create CSV files.", + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument('-a', '--authors', nargs='+', + help='List of author names to process (uses default list if not specified)') + parser.add_argument('-f', '--authors-file', type=str, + help='Path to file containing list of authors, one per line (uses default list if not specified)') + parser.add_argument('-o', '--output', type=str, default='figshare_articles.csv', + help='Output CSV filename for publications, without duplicates') + parser.add_argument('-O', '--output-all', type=str, default='figshare_articles_all.csv', + help='Output CSV filename for all publications by authors (includes duplicates when multiple authors per output)') + parser.add_argument('--use-author-cache', action='store_true', + help='Use cached author data instead of refreshing from API') + parser.add_argument('--rate-limit-delay', type=float, default=1.0, + help='Delay in seconds between Figshare API requests (default: 1.0)') + parser.add_argument('--max-retries', type=int, default=1, + help='Maximum number of retry attempts for 403 errors (default: 1)') + parser.add_argument('--debug', action='store_true', + help='Enable debug logging') + + return parser.parse_args() + + +def load_authors_from_file(filename): + """Load author names from a file, one per line.""" + try: + with open(filename, 'r') as f: + return [line.strip() for 
line in f if line.strip()] + except Exception as e: + logger.error(f"Error loading authors from file {filename}: {e}") + return [] + + +def figshare_fetch(): + """ + Fetch FigShare publications for specified authors and create CSV files. + + This function: + 1. Retrieves publication data for each author from FigShare + 2. Combines all publications into a single dataset + 3. Removes duplicates while preserving author information + 4. Exports results to CSV files (without bibtex generation) + """ + args = parse_args() + + if args.debug: + logger.setLevel(DEBUG) + + # Get list of authors + authors_list = [] + if args.authors: + authors_list.extend(args.authors) + if args.authors_file: + authors_list.extend(load_authors_from_file(args.authors_file)) + + # Use default authors if none specified + if not authors_list: + authors_list = [ + "Marc Hanheide", "Marcello Calisti", "Grzegorz Cielniak", + "Simon Parsons", "Elizabeth Sklar", "Paul Baxter", + "Petra Bosilj", "Heriberto Cuayahuitl", "Gautham Das", + "Francesco Del Duchetto", "Charles Fox", "Leonardo Guevara", + "Helen Harman", "Mohammed Al-Khafajiy", "Alexandr Klimchik", + "Riccardo Polvara", "Athanasios Polydoros", "Zied Tayeb", + "Sepehr Maleki", "Junfeng Gao", "Tom Duckett", "Mini Rai", + "Amir Ghalamzan Esfahani" + ] + logger.info(f"Using default list of {len(authors_list)} authors") + else: + logger.info(f"Processing {len(authors_list)} authors from command line/file") + + authors = {} + df_all = None + + for author_name in authors_list: + logger.info(f"*** Processing {author_name}...") + + authors[author_name] = Author(author_name, debug=args.debug, rate_limit_delay=args.rate_limit_delay, max_retries=args.max_retries) + cache_exists = os.path.exists(f"{author_name}.db") + + if cache_exists and args.use_author_cache: + logger.info(f"Loading cached data for {author_name}") + authors[author_name].load() + else: + logger.info(f"Retrieving data for {author_name}") + # Call retrieve WITHOUT bibtex generation + authors[author_name]._retrieve_figshare(use_cache=args.use_author_cache) + authors[author_name]._remove_non_repository() + authors[author_name]._retrieve_details(use_cache=True) + authors[author_name]._custom_fields_to_dicts() + authors[author_name]._flatten() + authors[author_name]._create_dataframe() + # Note: NOT calling _retrieve_bibtex_from_dois() here - that's for script 2 + authors[author_name].save() + + if authors[author_name].df is not None: + if df_all is None: + df_all = authors[author_name].df + else: + df_all = pd.concat([df_all, authors[author_name].df]) + + # Save individual author CSV + authors[author_name].df.to_csv(f"{author_name}.csv", index=False, encoding='utf-8') + logger.info(f"Saved {len(authors[author_name].df)} articles for {author_name} to {author_name}.csv") + else: + logger.warning(f"No data found for {author_name}") + + if df_all is None or len(df_all) == 0: + logger.error("No publication data found. 
Exiting.") + return + + logger.info(f"Total number of articles before deduplication: {len(df_all)}") + + # Group by ID and aggregate authors into lists + grouped = df_all.groupby('id').agg({ + 'author': lambda x: list(set(x)) # Use set to remove duplicate authors + }) + + # Filter the original dataframe to keep only one row per ID + deduplicated_df = df_all.drop_duplicates(subset=['id'], keep='first') + + # Add the aggregated authors list as a new column + deduplicated_df = deduplicated_df.set_index('id') + deduplicated_df['authors'] = grouped['author'] + deduplicated_df = deduplicated_df.reset_index() + + # Convert authors list to comma-separated string + deduplicated_df['authors'] = deduplicated_df['authors'].apply(lambda authors: ', '.join(authors)) + + logger.info(f"Total number of articles after deduplication: {len(deduplicated_df)}") + + # Save deduplicated data to CSV + deduplicated_df.to_csv(args.output, index=False, encoding='utf-8') + logger.info(f"Saved deduplicated articles to {args.output}") + + # Save all data to CSV + df_all.to_csv(args.output_all, index=False, encoding='utf-8') + logger.info(f"Saved all articles to {args.output_all}") + + logger.info("Fetch processing complete - CSV files created successfully") + logger.info(f"Next step: Run figshare_bibtex.py to generate bibtex from {args.output}") + + +if __name__ == "__main__": + figshare_fetch() From d42d77db5b3c3ba0737d771af9b6d38d436ba8ef Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 23 Dec 2025 18:30:28 +0000 Subject: [PATCH 3/5] Make scripts executable Co-authored-by: marc-hanheide <1153084+marc-hanheide@users.noreply.github.com> --- figshare_bibtex.py | 0 figshare_fetch.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 figshare_bibtex.py mode change 100644 => 100755 figshare_fetch.py diff --git a/figshare_bibtex.py b/figshare_bibtex.py old mode 100644 new mode 100755 diff --git a/figshare_fetch.py b/figshare_fetch.py old mode 100644 new mode 100755 From 536cbb3f53b884dcfffa84c92687d808fce0c6eb Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 23 Dec 2025 18:33:46 +0000 Subject: [PATCH 4/5] Fix code review issues: IndexError, spelling, and code duplication Co-authored-by: marc-hanheide <1153084+marc-hanheide@users.noreply.github.com> --- author.py | 91 ++++----------------------------------- doi2bib.py | 2 +- doi_utils.py | 103 +++++++++++++++++++++++++++++++++++++++++++++ figshare_bibtex.py | 91 ++++----------------------------------- 4 files changed, 121 insertions(+), 166 deletions(-) create mode 100644 doi_utils.py diff --git a/author.py b/author.py index e94056f..d4813b7 100644 --- a/author.py +++ b/author.py @@ -4,13 +4,12 @@ import pandas as pd import shelve import re -import requests from logging import getLogger, INFO, DEBUG from flatten_dict import flatten -from difflib import SequenceMatcher from figshare_api import FigShare from doi2bib import doi2bib +from doi_utils import guess_doi_from_crossref class Author: @@ -75,86 +74,14 @@ def _guess_doi(self, article): """ Use crossref API to guess the DOI for an article based on the title and authors """ - with shelve.open("crossref_cache.db") as cache: - if 'title' not in article or not article['title']: - self.logger.warning("No title found for article, can't guess DOI") - return None - - title = article['title'] - author = article['author'] - - if title in cache: - self.logger.info(f"Found DOI 
{cache[title]} in cache for title: {title}") - return cache[title] - - # Construct query URL for Crossref API - base_url = "https://api.crossref.org/works" - params = { - "query.query.bibliographic": f"{title}", - "query.author": f"{author}", - "sort": "relevance", - "rows": 10, # Get top 10 matches - "select": "DOI,title,author", - } - - try: - - self.logger.debug(f"Querying Crossref for title: {title}") - response = requests.get(base_url, params=params) - response.raise_for_status() - - # Check if response is valid and contains JSON - if response.ok and response.headers.get('Content-Type', '').lower().startswith('application/json') and response.text.strip(): - data = response.json() - else: - self.logger.warning(f"Received empty or invalid JSON response from Crossref API (status: {response.status_code})") - return None - - if data["message"]["total-results"] == 0: - self.logger.debug(f"No DOI found for: {title}") - return None - - # Get all matches and find the best one using fuzzy matching - items = data["message"]["items"] - if items: - self.logger.debug(f"Found {len(items)} potential matches for title: {title}") - - best_match = None - best_score = 0 - threshold = 0.8 # Minimum similarity score to accept a match - - for item in items: - if "title" in item and item["title"]: - item_title = item["title"][0] - # Calculate similarity score - score = SequenceMatcher(None, title.lower(), item_title.lower()).ratio() - self.logger.debug(f"==== '{title}' == '{item['title'][0]}'??? ==> {score:.2f}") - - if score > best_score: - best_score = score - best_match = item - - if best_match and best_score >= threshold: - doi = best_match.get("DOI") - authors_string = str(best_match.get("author", "")) - authors_last_name = article['author'].split()[-1] - - if doi and authors_last_name in authors_string: - self.logger.info(f"Found DOI {doi} for title: {title} (match score: {best_score:.2f})") - cache[title] = doi - return doi - else: - self.logger.warning(f"DOI found but author {authors_last_name} not in authors list or DOI missing") - else: - self.logger.warning(f"No good title match found. Best score was {best_score:.2f}, below threshold {threshold}") - self.logger.warning(f" '{title}' != '{best_match['title'][0]}' (score: {best_score:.2f})") - - return None - - except Exception as e: - self.logger.warning(f"Error guessing DOI: {e}") - + if 'title' not in article or not article['title']: + self.logger.warning("No title found for article, can't guess DOI") return None + + title = article['title'] + author = article['author'] + + return guess_doi_from_crossref(title, author) def _retrieve_bibtex_from_dois(self): @@ -162,7 +89,7 @@ def _retrieve_bibtex_from_dois(self): self.logger.warning(f"no dataframe found for {self.name}, can't continue") return doi2bibber = doi2bib() - # iteratre over all rows in the dataframe self.df + # iterate over all rows in the dataframe self.df for index, row in self.df.iterrows(): doi = row['External DOI'] # Check if DOI is in valid format diff --git a/doi2bib.py b/doi2bib.py index 3233c3d..ce89b5a 100644 --- a/doi2bib.py +++ b/doi2bib.py @@ -48,7 +48,7 @@ def shorten(self, doi): def get_bibtext(self, doi): """ - Use DOI Content Negotioation (http://crosscite.org/cn/) to retrieve a string + Use DOI Content Negotiation (http://crosscite.org/cn/) to retrieve a string with the bibtex entry. 
""" with shelve.open(self.bibtext_cache_file) as cache: diff --git a/doi_utils.py b/doi_utils.py new file mode 100644 index 0000000..6e804a8 --- /dev/null +++ b/doi_utils.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Utility functions for DOI guessing and retrieval +""" + +import requests +import shelve +from logging import getLogger +from difflib import SequenceMatcher + +logger = getLogger("doi_utils") + + +def guess_doi_from_crossref(title, author): + """ + Use crossref API to guess the DOI for an article based on the title and authors + + Args: + title: Article title + author: Author name + + Returns: + DOI string if found, None otherwise + """ + with shelve.open("crossref_cache.db") as cache: + if not title: + logger.warning("No title found for article, can't guess DOI") + return None + + if title in cache: + logger.info(f"Found DOI {cache[title]} in cache for title: {title}") + return cache[title] + + # Construct query URL for Crossref API + base_url = "https://api.crossref.org/works" + params = { + "query.query.bibliographic": f"{title}", + "query.author": f"{author}", + "sort": "relevance", + "rows": 10, # Get top 10 matches + "select": "DOI,title,author", + } + + try: + + logger.debug(f"Querying Crossref for title: {title}") + response = requests.get(base_url, params=params) + response.raise_for_status() + + # Check if response is valid and contains JSON + if response.ok and response.headers.get('Content-Type', '').lower().startswith('application/json') and response.text.strip(): + data = response.json() + else: + logger.warning(f"Received empty or invalid JSON response from Crossref API (status: {response.status_code})") + return None + + if data["message"]["total-results"] == 0: + logger.debug(f"No DOI found for: {title}") + return None + + # Get all matches and find the best one using fuzzy matching + items = data["message"]["items"] + if items: + logger.debug(f"Found {len(items)} potential matches for title: {title}") + + best_match = None + best_score = 0 + threshold = 0.8 # Minimum similarity score to accept a match + + for item in items: + if "title" in item and item["title"]: + item_title = item["title"][0] + # Calculate similarity score + score = SequenceMatcher(None, title.lower(), item_title.lower()).ratio() + logger.debug(f"==== '{title}' == '{item['title'][0]}'??? ==> {score:.2f}") + + if score > best_score: + best_score = score + best_match = item + + if best_match and best_score >= threshold: + doi = best_match.get("DOI") + authors_string = str(best_match.get("author", "")) + authors_last_name = author.split()[-1] + + if doi and authors_last_name in authors_string: + logger.info(f"Found DOI {doi} for title: {title} (match score: {best_score:.2f})") + cache[title] = doi + return doi + else: + logger.warning(f"DOI found but author {authors_last_name} not in authors list or DOI missing") + else: + logger.warning(f"No good title match found. 
Best score was {best_score:.2f}, below threshold {threshold}") + if best_match and 'title' in best_match: + logger.warning(f" '{title}' != '{best_match['title'][0]}' (score: {best_score:.2f})") + + return None + + except Exception as e: + logger.warning(f"Error guessing DOI: {e}") + + return None diff --git a/figshare_bibtex.py b/figshare_bibtex.py index 5df8123..15d968a 100755 --- a/figshare_bibtex.py +++ b/figshare_bibtex.py @@ -15,12 +15,10 @@ from bibtexparser.bibdatabase import BibDatabase import argparse import re -import requests -import shelve from logging import getLogger, basicConfig, INFO, DEBUG -from difflib import SequenceMatcher from doi2bib import doi2bib +from doi_utils import guess_doi_from_crossref basicConfig(level=INFO) logger = getLogger(__name__) @@ -46,87 +44,14 @@ def guess_doi(article_row): """ Use crossref API to guess the DOI for an article based on the title and authors """ - with shelve.open("crossref_cache.db") as cache: - if 'title' not in article_row or not article_row['title']: - logger.warning("No title found for article, can't guess DOI") - return None - - title = article_row['title'] - author = article_row['author'] - - if title in cache: - logger.info(f"Found DOI {cache[title]} in cache for title: {title}") - return cache[title] - - # Construct query URL for Crossref API - base_url = "https://api.crossref.org/works" - params = { - "query.query.bibliographic": f"{title}", - "query.author": f"{author}", - "sort": "relevance", - "rows": 10, # Get top 10 matches - "select": "DOI,title,author", - } - - try: - - logger.debug(f"Querying Crossref for title: {title}") - response = requests.get(base_url, params=params) - response.raise_for_status() - - # Check if response is valid and contains JSON - if response.ok and response.headers.get('Content-Type', '').lower().startswith('application/json') and response.text.strip(): - data = response.json() - else: - logger.warning(f"Received empty or invalid JSON response from Crossref API (status: {response.status_code})") - return None - - if data["message"]["total-results"] == 0: - logger.debug(f"No DOI found for: {title}") - return None - - # Get all matches and find the best one using fuzzy matching - items = data["message"]["items"] - if items: - logger.debug(f"Found {len(items)} potential matches for title: {title}") - - best_match = None - best_score = 0 - threshold = 0.8 # Minimum similarity score to accept a match - - for item in items: - if "title" in item and item["title"]: - item_title = item["title"][0] - # Calculate similarity score - score = SequenceMatcher(None, title.lower(), item_title.lower()).ratio() - logger.debug(f"==== '{title}' == '{item['title'][0]}'??? ==> {score:.2f}") - - if score > best_score: - best_score = score - best_match = item - - if best_match and best_score >= threshold: - doi = best_match.get("DOI") - authors_string = str(best_match.get("author", "")) - authors_last_name = author.split()[-1] - - if doi and authors_last_name in authors_string: - logger.info(f"Found DOI {doi} for title: {title} (match score: {best_score:.2f})") - cache[title] = doi - return doi - else: - logger.warning(f"DOI found but author {authors_last_name} not in authors list or DOI missing") - else: - logger.warning(f"No good title match found. 
Best score was {best_score:.2f}, below threshold {threshold}") - if best_match: - logger.warning(f" '{title}' != '{best_match['title'][0]}' (score: {best_score:.2f})") - - return None - - except Exception as e: - logger.warning(f"Error guessing DOI: {e}") - + if 'title' not in article_row or not article_row['title']: + logger.warning("No title found for article, can't guess DOI") return None + + title = article_row['title'] + author = article_row['author'] + + return guess_doi_from_crossref(title, author) def retrieve_bibtex_from_dois(df): From f2387065c8af6c382574bdc6e12b21783b6aadab Mon Sep 17 00:00:00 2001 From: Marc Hanheide Date: Tue, 23 Dec 2025 18:46:59 +0000 Subject: [PATCH 5/5] Refactor figshare_fetch.py to separate data retrieval phases and improve rate limiting handling --- .github/workflows/figshare-processing.yaml | 2 +- figshare.py | 699 --------------------- figshare_fetch.py | 18 +- 3 files changed, 16 insertions(+), 703 deletions(-) delete mode 100644 figshare.py diff --git a/.github/workflows/figshare-processing.yaml b/.github/workflows/figshare-processing.yaml index dd45b4d..5bdd1a9 100644 --- a/.github/workflows/figshare-processing.yaml +++ b/.github/workflows/figshare-processing.yaml @@ -70,7 +70,7 @@ jobs: python ../figshare_fetch.py --use-author-cache else echo "Running figshare_fetch.py without cache (default behavior)" - python ../figshare_fetch.py --rate-limit-delay 3 + python ../figshare_fetch.py --rate-limit-delay 1 --max-retries 30 fi - name: Run figshare bibtex (Step 2 - Generate bibtex from CSV) diff --git a/figshare.py b/figshare.py deleted file mode 100644 index 269068d..0000000 --- a/figshare.py +++ /dev/null @@ -1,699 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -from requests import get, post -from json import loads -from pprint import pformat -import pandas as pd - -from logging import getLogger, basicConfig, INFO, DEBUG -import os - -from flatten_dict import flatten - - -import urllib.request - -import requests -import bibtexparser -from bibtexparser.bparser import BibTexParser -from bibtexparser.bibdatabase import BibDatabase - -import shelve -import re -import argparse -from datetime import datetime -from difflib import SequenceMatcher -import time - - -basicConfig(level=INFO) -logger = getLogger(__name__) - -class doi2bib: - - def __init__(self): - self.bibtext_cache_file = "bibtext_cache" - self.shortdoi_cache_file = "shortdoi_cache" - self.logger = getLogger("doi2bib") - self.logger.setLevel(INFO) - - - def shorten(self, doi): - """ - Get the shortDOI for a DOI. Providing a cache dictionary will prevent - multiple API requests for the same DOI. 
- """ - with shelve.open(self.shortdoi_cache_file) as cache: - if doi in cache: - self.logger.debug(f"short doi for {doi} found in cache") - return cache[doi] - quoted_doi = urllib.request.quote(doi) - url = 'http://shortdoi.org/{}?format=json'.format(quoted_doi) - try: - response = requests.get(url) - # Check if response is valid and contains JSON - if response.ok and response.headers.get('Content-Type', '').lower().startswith('application/json') and response.text.strip(): - result = response.json() - short_doi = result['ShortDOI'] - else: - self.logger.warning(f"Received empty or invalid JSON response for {doi} from {url} (status: {response.status_code})") - return None - except Exception as e: - self.logger.warning(f"failed to get short doi for {doi}: {e}") - return None - self.logger.debug(f"short doi for {doi} is {short_doi}, caching it") - cache[doi] = short_doi - return short_doi - - def get_bibtext(self, doi): - """ - Use DOI Content Negotioation (http://crosscite.org/cn/) to retrieve a string - with the bibtex entry. - """ - with shelve.open(self.bibtext_cache_file) as cache: - if doi in cache: - self.logger.debug(f"bibtex for {doi} found in cache") - return cache[doi] - url = 'https://doi.org/' + urllib.request.quote(doi) - header = { - 'Accept': 'application/x-bibtex', - } - response = requests.get(url, headers=header) - if not response.ok: - self.logger.warning(f"failed to get bibtex for {doi}, status code {response.status_code}") - return "" - bibtext = response.text - if bibtext: - self.logger.debug(f"bibtex for {doi} found, caching it") - cache[doi] = bibtext - else: - self.logger.warning(f"failed to get bibtex for {doi}") - return bibtext - - def get_bibtex_entry(self, doi): - """ - Return a bibtexparser entry for a DOI - """ - bibtext = self.get_bibtext(doi) - if not bibtext: - return None - - short_doi = self.shorten(doi) - parser = BibTexParser() - parser.ignore_nonstandard_types = False - bibdb = bibtexparser.loads(bibtext, parser) - entry, = bibdb.entries - quoted_doi = urllib.request.quote(doi) - entry['link'] = 'https://doi.org/{}'.format(quoted_doi) - if 'author' in entry: - entry['author'] = ' and '.join(entry['author'].rstrip(';').split('; ')) - entry['ID'] = short_doi[3:] - return entry - - def entries_to_str(self, entries): - """ - Pass a list of bibtexparser entries and return a bibtex formatted string. 
- """ - db = BibDatabase() - db.entries = entries - return bibtexparser.dumps(db) - - -class FigShare: - def __init__(self, page_size=100, rate_limit_delay=1.0, max_retries=5): - self.logger = getLogger("FigShare") - self.token = os.getenv('FIGSHARE_TOKEN') - if self.token: - self.logger.info("Figshare API: Using authenticated requests") - else: - self.logger.warning("Figshare API: No authentication token found - using anonymous requests (may hit rate limits or receive 403 errors)") - self.page_size = page_size - self.rate_limit_delay = rate_limit_delay - self.max_retries = max_retries - self.base_url = "https://api.figshare.com/v2" - - if self.rate_limit_delay > 0: - self.logger.info(f"Rate limiting enabled: {self.rate_limit_delay} second delay between API requests") - - # Use shelve for persistent caching - self.cache_file = "figshare_cache.db" - - with shelve.open(self.cache_file) as cache: - self.logger.info(f"Figshare API: Using cache file {self.cache_file} with {len(cache.keys())} entries") - for key in list(cache.keys()): - self.logger.debug(f" existing cache key: {key}") - - - def __init_params(self): - return { - "page_size": self.page_size - } - - def __handle_403_error(self, url, method="GET", response_text=""): - """Handle 403 Forbidden errors with helpful messages""" - if not self.token: - self.logger.error(f"403 Forbidden for {method} {self.base_url + url}: " - f"Authentication required. Set FIGSHARE_TOKEN environment variable. " - f"See README.md for instructions.") - else: - self.logger.error(f"403 Forbidden for {method} {self.base_url + url}: " - f"Token may be invalid or lack permissions. " - f"Check token in Figshare account settings.") - if response_text: - self.logger.error(f"Response text: {response_text}") - - def __get(self, url, params=None, use_cache=True): - hash_key = f"GET{url}{'?' + str(params) if params else ''}" - - with shelve.open(self.cache_file) as cache: - if hash_key in cache and use_cache: - self.logger.info(f"Cache hit for GET {url}") - return cache[hash_key] - - headers = { "Authorization": "token " + self.token } if self.token else {} - - # Retry logic for 403 errors - for attempt in range(self.max_retries): - response = get(self.base_url + url, headers=headers, params=params) - - # Handle 403 Forbidden errors with retry logic - if response.status_code == 403: - if attempt < self.max_retries - 1: - # Exponential backoff: 1s, 2s, 4s, 8s, 16s - wait_time = 2 ** attempt - self.logger.warning(f"403 Forbidden for GET {url} (attempt {attempt + 1}/{self.max_retries}), retrying in {wait_time}s...") - time.sleep(wait_time) - continue - else: - # Final attempt failed, log error and return - self.__handle_403_error(url, "GET", response.text) - return {} - - # Success - break out of retry loop - break - - # Rate limiting: sleep after each API request - if self.rate_limit_delay > 0: - time.sleep(self.rate_limit_delay) - - # Check if response is valid and contains JSON - if response.ok and response.headers.get('Content-Type', '').lower().startswith('application/json') and response.text.strip(): - result = response.json() - cache[hash_key] = result - self.logger.debug(f"Cached result for GET {url}") - return result - else: - self.logger.warning(f"Received empty or invalid JSON response for GET {self.base_url + url} (status: {response.status_code})") - return {} - - def __post(self, url, params=None, use_cache=True): - hash_key = f"POST{url}{'?' 
+ str(params) if params else ''}" - - with shelve.open(self.cache_file) as cache: - if hash_key in cache and use_cache: - self.logger.debug(f"Cache hit for POST {url}") - return cache[hash_key] - - headers = { "Authorization": "token " + self.token } if self.token else {} - - # Retry logic for 403 errors - for attempt in range(self.max_retries): - response = post(self.base_url + url, headers=headers, json=params) - - # Handle 403 Forbidden errors with retry logic - if response.status_code == 403: - if attempt < self.max_retries - 1: - # Exponential backoff: 1s, 2s, 4s, 8s, 16s - wait_time = 2 ** attempt - self.logger.warning(f"403 Forbidden for POST {url} (attempt {attempt + 1}/{self.max_retries}), retrying in {wait_time}s...") - time.sleep(wait_time) - continue - else: - # Final attempt failed, log error and return - self.__handle_403_error(url, "POST", response.text) - return [] - - # Success - break out of retry loop - break - - # Rate limiting: sleep after each API request - if self.rate_limit_delay > 0: - time.sleep(self.rate_limit_delay) - - # Check if response is valid and contains JSON - if response.ok and response.headers.get('Content-Type', '').lower().startswith('application/json') and response.text.strip(): - result = response.json() - cache[hash_key] = result - self.logger.debug(f"Cached result for POST {url}") - return result - else: - self.logger.warning(f"Received empty or invalid JSON response for POST {self.base_url + url} (status: {response.status_code})") - return [] - - - def articles_by_user_name(self, user_name, use_cache=True): - params = self.__init_params() - params["search_for"] = f':author: \"{user_name}\"' - page = 1 - articles = [] - while True: - params["page"] = page - self.logger.info(f"retrieving page {page} for user {user_name}") - current_page_articles = self.__post("/articles/search", params=params, use_cache=use_cache) - page += 1 - if len(current_page_articles) == 0: - break - articles += current_page_articles - self.logger.info(f"found {len(articles)} articles for user {user_name}") - - return articles - - def get_article(self, article_id, use_cache=True): - return self.__get(f"/articles/{article_id}", use_cache=use_cache) - -class Author: - def __init__(self, name, debug=False, rate_limit_delay=1.0, max_retries=5): - self.logger = getLogger("Author") - if debug: - self.logger.setLevel(DEBUG) - self.name = name - self.fs = FigShare(rate_limit_delay=rate_limit_delay, max_retries=max_retries) - self.articles = {} - self.public_html_prefix = "https://repository.lincoln.ac.uk" - self.df = None - - def save(self, filename=None): - if filename is None: - filename = f"{self.name}.db" - with shelve.open(filename) as db: - db['articles'] = self.articles - db['df'] = self.df - - def load(self, filename=None): - if filename is None: - filename = f"{self.name}.db" - with shelve.open(filename) as db: - self.articles = db['articles'] - self.df = db['df'] - - - def _retrieve_figshare(self, use_cache=True): - self.logger.info(f"retrieving articles for {self.name}") - self.articles = self.fs.articles_by_user_name(self.name, use_cache=use_cache) - - self.logger.info(f"found {len(self.articles)} articles for {self.name}") - - def _retrieve_details(self, use_cache=True): - for article in self.articles: - self.logger.info(f"retrieving details for article {article['id']}") - article['details'] = self.fs.get_article(article['id'], use_cache=use_cache) - - def _remove_non_repository(self): - self.logger.info(f"removing non-repository articles out of {len(self.articles)}") - 
-
-    def _custom_fields_to_dicts(self):
-        for article in self.articles:
-            if 'details' not in article:
-                continue
-            if 'custom_fields' not in article['details']:
-                continue
-            self.logger.debug(f"convert")
-
-            cf = article['details']['custom_fields']
-            if type(cf) == list:
-                new_cf = {}
-                for p in cf:
-                    new_cf[p['name']] = p['value']
-                article['details']['custom_fields'] = new_cf
-
-
-    def _guess_doi(self, article):
-        """
-        Use crossref API to guess the DOI for an article based on the title and authors
-        """
-        with shelve.open("crossref_cache.db") as cache:
-            if 'title' not in article or not article['title']:
-                self.logger.warning("No title found for article, can't guess DOI")
-                return None
-
-            title = article['title']
-            author = article['author']
-
-            if title in cache:
-                self.logger.info(f"Found DOI {cache[title]} in cache for title: {title}")
-                return cache[title]
-
-            # Construct query URL for Crossref API
-            base_url = "https://api.crossref.org/works"
-            params = {
-                "query.query.bibliographic": f"{title}",
-                "query.author": f"{author}",
-                "sort": "relevance",
-                "rows": 10, # Get top 10 matches
-                "select": "DOI,title,author",
-            }
-
-            try:
-
-                self.logger.debug(f"Querying Crossref for title: {title}")
-                response = requests.get(base_url, params=params)
-                response.raise_for_status()
-
-                # Check if response is valid and contains JSON
-                if response.ok and response.headers.get('Content-Type', '').lower().startswith('application/json') and response.text.strip():
-                    data = response.json()
-                else:
-                    self.logger.warning(f"Received empty or invalid JSON response from Crossref API (status: {response.status_code})")
-                    return None
-
-                if data["message"]["total-results"] == 0:
-                    self.logger.debug(f"No DOI found for: {title}")
-                    return None
-
-                # Get all matches and find the best one using fuzzy matching
-                items = data["message"]["items"]
-                if items:
-                    self.logger.debug(f"Found {len(items)} potential matches for title: {title}")
-
-                    best_match = None
-                    best_score = 0
-                    threshold = 0.8 # Minimum similarity score to accept a match
-
-                    for item in items:
-                        if "title" in item and item["title"]:
-                            item_title = item["title"][0]
-                            # Calculate similarity score
-                            score = SequenceMatcher(None, title.lower(), item_title.lower()).ratio()
-                            logger.debug(f"==== '{title}' == '{item['title'][0]}'??? ==> {score:.2f}")
-
-                            if score > best_score:
-                                best_score = score
-                                best_match = item
-
-                    if best_match and best_score >= threshold:
-                        doi = best_match.get("DOI")
-                        authors_string = str(best_match.get("author", ""))
-                        authors_last_name = article['author'].split()[-1]
-
-                        if doi and authors_last_name in authors_string:
-                            self.logger.info(f"Found DOI {doi} for title: {title} (match score: {best_score:.2f})")
-                            cache[title] = doi
-                            return doi
-                        else:
-                            self.logger.warning(f"DOI found but author {authors_last_name} not in authors list or DOI missing")
-                    else:
-                        self.logger.warning(f"No good title match found. Best score was {best_score:.2f}, below threshold {threshold}")
-                        self.logger.warning(f" '{title}' != '{best_match['title'][0]}' (score: {best_score:.2f})")
-
-                return None
-
-            except Exception as e:
-                self.logger.warning(f"Error guessing DOI: {e}")
-
-        return None
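
# --- Illustrative sketch (editor's addition, not part of the patch) ----------
# _guess_doi above ranks Crossref candidates by title similarity and only
# accepts a hit above a fixed threshold. The helper below isolates that scoring
# step; best_title_match and its default threshold are assumed names/values.
from difflib import SequenceMatcher

def best_title_match(query_title, candidates, threshold=0.8):
    """Return the candidate whose first title is most similar to query_title,
    or None if the best similarity ratio stays below the threshold."""
    best_item, best_score = None, 0.0
    for item in candidates:
        titles = item.get("title") or []
        if not titles:
            continue
        score = SequenceMatcher(None, query_title.lower(), titles[0].lower()).ratio()
        if score > best_score:
            best_item, best_score = item, score
    return best_item if best_score >= threshold else None

# e.g. best_title_match("Robot fleets in agriculture",
#                       [{"title": ["Robot Fleets in Agriculture"], "DOI": "10.1234/x"}])
# returns that candidate, because the similarity ratio is close to 1.0.
# ------------------------------------------------------------------------------
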
-
-
-    def _retrieve_bibtex_from_dois(self):
-        if self.df is None:
-            self.logger.warning(f"no dataframe found for {self.name}, can't continue")
-            return
-        doi2bibber = doi2bib()
-        # iteratre over all rows in the dataframe self.df
-        for index, row in self.df.iterrows():
-            doi = row['External DOI']
-            # Check if DOI is in valid format
-            if doi and isinstance(doi, str):
-                # Basic DOI validation - should start with 10. followed by numbers/dots/hyphens
-                if not doi.startswith('10.') or not len(doi.split('/', 1)) == 2:
-                    self.logger.warning(f"Invalid DOI format: {doi}, will try to guess")
-                    doi = None
-            else:
-                self.logger.info(f"No DOI defined in record for article, will try to guess")
-                doi = None
-            if doi is None:
-                doi = self._guess_doi(row)
-                if doi is None:
-                    self.logger.debug(f"Unable to guess DOI for article, no option left but to skip it")
-                    continue
-                self.logger.info(f"Guessed DOI for article: {doi}, updating dataframe")
-                self.df.at[index, 'External DOI'] = doi
-            try:
-                bibtex = doi2bibber.get_bibtex_entry(doi)
-                # Update the dataframe with the bibtex information
-                if bibtex is not None:
-                    self.df.at[index, 'bibtex'] = bibtex
-                    self.df.at[index, 'bibtex_str'] = doi2bibber.entries_to_str([bibtex])
-                    self.logger.info(f"got bibtex for {doi}")
-                else:
-                    self.logger.warning(f"Couldn't get bibtex for {doi}")
-
-            except Exception as e:
-                self.logger.warning(f"Failed to get bibtex for {doi}: {e}")
-
-    def _flatten(self):
-        new_articles = []
-        for a in self.articles:
-            new_articles.append(flatten(a, reducer='path'))
-        self.articles = new_articles
-
-    def retrieve(self, use_cache=True):
-        self._retrieve_figshare(use_cache=use_cache)
-        self._remove_non_repository()
-        self._retrieve_details(use_cache=True)
-        self._custom_fields_to_dicts()
-        self._flatten()
-        self._create_dataframe()
-        self._retrieve_bibtex_from_dois()
-
-    def _create_dataframe(self):
-        if len(self.articles) == 0:
-            self.logger.warning(f"no articles found for {self.name}, can't create dataframe")
-            self.df = None
-            return
-        self.df = pd.DataFrame.from_dict(self.articles)
-        # add column with author name
-        self.df['author'] = self.name
-        # add column with online date (as datetime object)
-        self.df['online_date'] = pd.to_datetime(self.df['timeline/firstOnline'], utc=True)
-        # add column with online year
-        self.df['online_year'] = self.df['online_date'].apply(
-            lambda x: x.year
-        )
-        # add column with external DOI, parsed from custom_fields
-        self.df['External DOI'] = self.df['details/custom_fields/External DOI'].apply(
-            lambda x: re.sub(r'^(?:https?://doi\.org/|doi:)', '', x[0], flags=re.IGNORECASE).replace('doi:','')
-            if isinstance(x, list) and len(x) > 0 else None
-        )
-
-
-
-        return self.df
-
-
-def doi2bibtex_test():
-    doi = "10.1109/MRA.2023.3296983"
-    doi2bibber = doi2bib()
-    bibtex = doi2bibber.get_bibtex_entry(doi)
-    print(doi2bibber.entries_to_str([bibtex]))
-
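
# --- Illustrative sketch (editor's addition, not part of the patch) ----------
# Before asking doi2bib for a BibTeX entry, the code above strips
# "https://doi.org/" / "doi:" prefixes and only keeps values shaped like
# "10.<registrant>/<suffix>". The helper below (normalise_doi is an assumed
# name) captures that normalisation and validation in one place.
import re

def normalise_doi(raw):
    """Return a bare DOI string, or None if the value does not look like a DOI."""
    if not raw or not isinstance(raw, str):
        return None
    doi = re.sub(r'^(?:https?://doi\.org/|doi:)', '', raw.strip(), flags=re.IGNORECASE)
    # A DOI starts with "10." and has exactly one prefix/suffix separator
    if doi.startswith('10.') and len(doi.split('/', 1)) == 2 and doi.split('/', 1)[1]:
        return doi
    return None

# normalise_doi("https://doi.org/10.1109/MRA.2023.3296983") -> "10.1109/MRA.2023.3296983"
# normalise_doi("not-a-doi") -> None
# ------------------------------------------------------------------------------
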
-
-
-def parse_args():
-    """Parse command-line arguments."""
-    parser = argparse.ArgumentParser(
-        description="Process publications from FigShare repository for specified authors.",
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    )
-    parser.add_argument('-a', '--authors', nargs='+',
-                        help='List of author names to process (uses default list if not specified)')
-    parser.add_argument('-f', '--authors-file', type=str,
-                        help='Path to file containing list of authors, one per line (uses default list if not specified)')
-    parser.add_argument('-s', '--since', type=str, default='2021-01-01',
-                        help='Process only publications since this date (YYYY-MM-DD)')
-    parser.add_argument('-o', '--output', type=str, default='figshare_articles.csv',
-                        help='Output CSV filename for publications, without duplicates')
-    parser.add_argument('-O', '--output-all', type=str, default='figshare_articles_all.csv',
-                        help='Output CSV filename for all publications by authors (includes duplicates when multiple authors per output)')
-    # parser.add_argument('-r', '--recent-output', type=str, default='figshare_articles_recent.csv',
-    #                     help='Output CSV filename for publications since specified date')
-    parser.add_argument('--use-author-cache', action='store_true',
-                        help='Use cached author data instead of refreshing from API')
-    parser.add_argument('--rate-limit-delay', type=float, default=1.0,
-                        help='Delay in seconds between Figshare API requests (default: 1.0)')
-    parser.add_argument('--max-retries', type=int, default=1,
-                        help='Maximum number of retry attempts for 403 errors (default: 1)')
-    parser.add_argument('--debug', action='store_true',
-                        help='Enable debug logging')
-
-    return parser.parse_args()
-
-def load_authors_from_file(filename):
-    """Load author names from a file, one per line."""
-    try:
-        with open(filename, 'r') as f:
-            return [line.strip() for line in f if line.strip()]
-    except Exception as e:
-        logger.error(f"Error loading authors from file {filename}: {e}")
-        return []
-
-def figshare_processing():
-    """
-    Process FigShare publications for specified authors.
-
-    This function:
-    1. Retrieves publication data for each author from FigShare
-    2. Combines all publications into a single dataset
-    3. Removes duplicates while preserving author information
-    4. Filters publications by date if specified
-    5. Exports results to CSV files
-    """
-    args = parse_args()
-
-    if args.debug:
-        logger.setLevel(DEBUG)
-
-    # Get list of authors
-    authors_list = []
-    if args.authors:
-        authors_list.extend(args.authors)
-    if args.authors_file:
-        authors_list.extend(load_authors_from_file(args.authors_file))
-
-    # Use default authors if none specified
-    if not authors_list:
-        authors_list = [
-            "Marc Hanheide", "Marcello Calisti", "Grzegorz Cielniak",
-            "Simon Parsons", "Elizabeth Sklar", "Paul Baxter",
-            "Petra Bosilj", "Heriberto Cuayahuitl", "Gautham Das",
-            "Francesco Del Duchetto", "Charles Fox", "Leonardo Guevara",
-            "Helen Harman", "Mohammed Al-Khafajiy", "Alexandr Klimchik",
-            "Riccardo Polvara", "Athanasios Polydoros", "Zied Tayeb",
-            "Sepehr Maleki", "Junfeng Gao", "Tom Duckett", "Mini Rai",
-            "Amir Ghalamzan Esfahani"
-        ]
-        logger.info(f"Using default list of {len(authors_list)} authors")
-    else:
-        logger.info(f"Processing {len(authors_list)} authors from command line/file")
-
-    authors = {}
-    all_articles = []
-    df_all = None
-
-    for author_name in authors_list:
-        logger.info(f"*** Processing {author_name}...")
-
-        authors[author_name] = Author(author_name, debug=args.debug, rate_limit_delay=args.rate_limit_delay, max_retries=args.max_retries)
-        cache_exists = os.path.exists(f"{author_name}.db")
-
-        if cache_exists and args.use_author_cache:
-            logger.info(f"Loading cached data for {author_name}")
-            authors[author_name].load()
-        else:
-            logger.info(f"Retrieving data for {author_name}")
-            authors[author_name].retrieve(args.use_author_cache)
-            authors[author_name].save()
-
-        if authors[author_name].df is not None:
-            if df_all is None:
-                df_all = authors[author_name].df
-            else:
-                df_all = pd.concat([df_all, authors[author_name].df])
-            all_articles.extend(authors[author_name].articles)
-
-            authors[author_name].df.to_csv(f"{author_name}.csv", index=False, encoding='utf-8')
-            bibtex_filename = f"{author_name}.bib"
-            bibtex = BibDatabase()
-            bibtex.entries = [entry for entry in authors[author_name].df['bibtex'].tolist() if isinstance(entry, dict)]
-            # Process all entries in the bibtex database and remove any duplicates based on ID
-            unique_entries = {}
-            for entry in authors[author_name].df['bibtex'].tolist():
-                if isinstance(entry, dict) and 'ID' in entry:
-                    # Use ID as the key to avoid duplicates
-                    unique_entries[entry['ID']] = entry
-                elif entry is not None:
-                    logger.debug(f"Skipping entry without ID: {entry}")
-
-            logger.info(f"Reduced from {len(authors[author_name].df['bibtex'].dropna())} to {len(unique_entries)} unique bibtex entries")
-            # Replace the entries with the unique ones
-            bibtex.entries = list(unique_entries.values())
-            with open(bibtex_filename, 'w') as f:
-                f.write(bibtexparser.dumps(bibtex))
-            logger.info(f"Saved bibtex entries to {bibtex_filename}")
-
-        else:
-            logger.warning(f"No data found for {author_name}")
-
-    if df_all is None or len(df_all) == 0:
-        logger.error("No publication data found. Exiting.")
-        return
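
# --- Illustrative sketch (editor's addition, not part of the patch) ----------
# The per-author export above collapses the BibTeX list to one entry per "ID"
# before writing the .bib file with bibtexparser. The helper below shows just
# that step; unique_bibtex_entries is an assumed name.
def unique_bibtex_entries(entries):
    """Collapse a list of bibtexparser entry dicts to one entry per ID."""
    unique = {}
    for entry in entries:
        if isinstance(entry, dict) and 'ID' in entry:
            unique[entry['ID']] = entry  # later duplicates overwrite earlier ones
    return list(unique.values())
# ------------------------------------------------------------------------------
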
Exiting.") - return - - logger.info(f"Total number of articles before deduplication: {len(df_all)}") - - # Group by ID and aggregate authors into lists - grouped = df_all.groupby('id').agg({ - 'author': lambda x: list(set(x)) # Use set to remove duplicate authors - }) - - # Filter the original dataframe to keep only one row per ID - deduplicated_df = df_all.drop_duplicates(subset=['id'], keep='first') - - # Add the aggregated authors list as a new column - deduplicated_df = deduplicated_df.set_index('id') - deduplicated_df['authors'] = grouped['author'] - deduplicated_df = deduplicated_df.reset_index() - - # Convert authors list to comma-separated string - deduplicated_df['authors'] = deduplicated_df['authors'].apply(lambda authors: ', '.join(authors)) - - logger.info(f"Total number of articles after deduplication: {len(deduplicated_df)}") - - # export bibtex file - bibtex_filename = "lcas.bib" - # with open(bibtex_filename, 'w') as f: - # for index, row in deduplicated_df.iterrows(): - # if 'bibtex_str' in row and isinstance(row['bibtex_str'], str): - # f.write(row['bibtex_str']) - # f.write("\n\n") - # logger.info(f"Saved bibtex entries to {bibtex_filename}") - bibtex = BibDatabase() - bibtex.entries = [entry for entry in deduplicated_df['bibtex'].tolist() if isinstance(entry, dict)] - # Process all entries in the bibtex database and remove any duplicates based on ID - unique_entries = {} - for entry in bibtex.entries: - if entry and 'ID' in entry: - # Use ID as the key to avoid duplicates - unique_entries[entry['ID']] = entry - else: - logger.debug(f"Skipping entry without ID: {entry}") - - logger.info(f"Reduced from {len(bibtex.entries)} to {len(unique_entries)} unique bibtex entries") - # Replace the entries with the unique ones - bibtex.entries = list(unique_entries.values()) - with open(bibtex_filename, 'w') as f: - f.write(bibtexparser.dumps(bibtex)) - logger.info(f"Saved bibtex entries to {bibtex_filename}") - - # Save all data to CSV - deduplicated_df.to_csv(args.output, index=False, encoding='utf-8') - logger.info(f"Saved deduplicated articles to {args.output}") - - # Save all data to CSV - df_all.to_csv(args.output_all, index=False, encoding='utf-8') - logger.info(f"Saved all articles to {args.output_all}") - - # # Parse the since date - # try: - # since_date = pd.Timestamp(datetime.strptime(args.since, '%Y-%m-%d')).tz_localize('UTC') - # filtered_df = deduplicated_df[deduplicated_df['online_date'] > since_date] - # filtered_df.to_csv(args.recent_output, index=False, encoding='utf-8') - # logger.info(f"Saved {len(filtered_df)} articles since {args.since} to {args.recent_output}") - # except ValueError as e: - # logger.error(f"Invalid date format: {e}. 
-
-    logger.info("Processing complete")
-
-if __name__ == "__main__":
-    figshare_processing()
diff --git a/figshare_fetch.py b/figshare_fetch.py
index 37c5c56..7dfa00c 100755
--- a/figshare_fetch.py
+++ b/figshare_fetch.py
@@ -97,7 +97,10 @@ def figshare_fetch():
     authors = {}
     df_all = None
+    authors_to_process = []  # Track authors that need detail retrieval
+
+    # First pass: Initialize authors and retrieve basic figshare data
+    logger.info("=== Phase 1: Retrieving basic article data from Figshare ===")
     for author_name in authors_list:
         logger.info(f"*** Processing {author_name}...")
@@ -108,17 +111,26 @@
             logger.info(f"Loading cached data for {author_name}")
             authors[author_name].load()
         else:
-            logger.info(f"Retrieving data for {author_name}")
-            # Call retrieve WITHOUT bibtex generation
+            logger.info(f"Retrieving basic data for {author_name}")
             authors[author_name]._retrieve_figshare(use_cache=args.use_author_cache)
             authors[author_name]._remove_non_repository()
+            authors_to_process.append(author_name)
+
+    # Second pass: Retrieve details for all articles (this is where rate limiting matters)
+    if authors_to_process:
+        logger.info("=== Phase 2: Retrieving detailed article information ===")
+        for author_name in authors_to_process:
+            logger.info(f"*** Retrieving details for {author_name}...")
             authors[author_name]._retrieve_details(use_cache=True)
             authors[author_name]._custom_fields_to_dicts()
             authors[author_name]._flatten()
             authors[author_name]._create_dataframe()
             # Note: NOT calling _retrieve_bibtex_from_dois() here - that's for script 2
             authors[author_name].save()
-
+
+    # Third pass: Aggregate dataframes and save individual CSVs
+    logger.info("=== Phase 3: Aggregating and saving results ===")
+    for author_name in authors_list:
         if authors[author_name].df is not None:
             if df_all is None:
                 df_all = authors[author_name].df
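
# --- Illustrative sketch (editor's addition, not part of the patch) ----------
# With the fetch / bibtex split, the second script only needs the CSV saved by
# Phase 3 plus the "External DOI" column that step 1 fills in. A minimal sketch
# of that hand-off; the filename default and the use of doi2bib here are
# assumptions, not the actual figshare_bibtex.py implementation.
import pandas as pd
from doi2bib import doi2bib  # helper module introduced by this refactor

def bibtex_from_csv(csv_path="figshare_articles.csv"):
    df = pd.read_csv(csv_path)
    resolver = doi2bib()
    entries = []
    for doi in df["External DOI"].dropna():
        entry = resolver.get_bibtex_entry(doi)  # one lookup per DOI
        if entry is not None:
            entries.append(entry)
    return entries
# ------------------------------------------------------------------------------
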