|
| 1 | +#!/usr/bin/env python |
| 2 | +# -*- coding: utf-8 -*- |
| 3 | + |
| 4 | +import pandas as pd |
| 5 | +import shelve |
| 6 | +import re |
| 7 | +from logging import getLogger, INFO, DEBUG |
| 8 | +from flatten_dict import flatten |
| 9 | + |
| 10 | +from figshare_api import FigShare |
| 11 | +from doi2bib import doi2bib |
| 12 | +from doi_utils import guess_doi_from_crossref |
| 13 | + |
| 14 | + |
class Author:
    """Harvest and enrich FigShare repository records for one author.

    Pipeline (see :meth:`retrieve`): fetch the author's articles from the
    FigShare API, keep only those hosted on the institutional repository,
    pull full per-article details, normalise the ``custom_fields`` payload,
    flatten the nested dicts into ``a/b/c`` path keys, build a pandas
    DataFrame, and resolve each record's DOI to a BibTeX entry.
    State can be persisted with :meth:`save` / :meth:`load`.
    """

    def __init__(self, name, debug=False, rate_limit_delay=1.0, max_retries=5):
        """Set up the API client and empty state.

        :param name: author name as known to FigShare
        :param debug: when True, lower this instance's log level to DEBUG
        :param rate_limit_delay: seconds between FigShare API calls
        :param max_retries: retries for a failing FigShare API call
        """
        self.logger = getLogger("Author")
        if debug:
            self.logger.setLevel(DEBUG)
        self.name = name
        self.fs = FigShare(rate_limit_delay=rate_limit_delay, max_retries=max_retries)
        self.articles = {}
        # only articles whose public page starts with this prefix are kept
        self.public_html_prefix = "https://repository.lincoln.ac.uk"
        self.df = None

    def save(self, filename=None):
        """Persist articles and DataFrame to a shelve db (default ``<name>.db``)."""
        if filename is None:
            filename = f"{self.name}.db"
        with shelve.open(filename) as db:
            db['articles'] = self.articles
            db['df'] = self.df

    def load(self, filename=None):
        """Restore articles and DataFrame from a shelve db (default ``<name>.db``)."""
        if filename is None:
            filename = f"{self.name}.db"
        with shelve.open(filename) as db:
            self.articles = db['articles']
            self.df = db['df']

    def _retrieve_figshare(self, use_cache=True):
        """Fetch the article list for this author from FigShare."""
        self.logger.info(f"retrieving articles for {self.name}")
        self.articles = self.fs.articles_by_user_name(self.name, use_cache=use_cache)
        self.logger.info(f"found {len(self.articles)} articles for {self.name}")

    def _retrieve_details(self, use_cache=True):
        """Attach the full FigShare record to each article under ``'details'``."""
        for article in self.articles:
            self.logger.info(f"retrieving details for article {article['id']}")
            article['details'] = self.fs.get_article(article['id'], use_cache=use_cache)

    def _remove_non_repository(self):
        """Drop articles whose public page is not on the institutional repository."""
        self.logger.info(f"removing non-repository articles out of {len(self.articles)}")
        self.articles = [
            a for a in self.articles
            if a['url_public_html'].startswith(self.public_html_prefix)
        ]
        self.logger.info(f"retained {len(self.articles)} articles")

    def _custom_fields_to_dicts(self):
        """Normalise each article's ``custom_fields`` list into a plain dict.

        FigShare returns custom fields as ``[{'name': ..., 'value': ...}, ...]``;
        convert that to ``{name: value}`` in place.  Articles without details
        or custom fields are left untouched.
        """
        for article in self.articles:
            if 'details' not in article:
                continue
            cf = article['details'].get('custom_fields')
            # isinstance (not type ==) so dict-shaped fields pass through untouched
            if isinstance(cf, list):
                self.logger.debug("converting custom_fields list to dict")
                article['details']['custom_fields'] = {p['name']: p['value'] for p in cf}

    def _guess_doi(self, article):
        """Guess the DOI for an article via the Crossref API.

        :param article: mapping (e.g. a DataFrame row) with 'title' and 'author'
        :return: DOI string, or None when no title is available / no match found
        """
        if 'title' not in article or not article['title']:
            self.logger.warning("No title found for article, can't guess DOI")
            return None
        return guess_doi_from_crossref(article['title'], article['author'])

    def _retrieve_bibtex_from_dois(self):
        """Resolve each row's DOI to BibTeX, guessing missing/invalid DOIs first.

        Updates ``self.df`` in place: writes guessed DOIs back to
        'External DOI' and stores results in 'bibtex' / 'bibtex_str'.
        Rows whose DOI cannot be determined are skipped.
        """
        if self.df is None:
            self.logger.warning(f"no dataframe found for {self.name}, can't continue")
            return
        doi2bibber = doi2bib()
        for index, row in self.df.iterrows():
            doi = row['External DOI']
            # isinstance first: missing values come through as float NaN
            if isinstance(doi, str) and doi:
                # minimal sanity check: DOIs look like '10.<registrant>/<suffix>'
                if not doi.startswith('10.') or '/' not in doi:
                    self.logger.warning(f"Invalid DOI format: {doi}, will try to guess")
                    doi = None
            else:
                self.logger.info("No DOI defined in record for article, will try to guess")
                doi = None
            if doi is None:
                doi = self._guess_doi(row)
                if doi is None:
                    self.logger.debug("Unable to guess DOI for article, no option left but to skip it")
                    continue
                self.logger.info(f"Guessed DOI for article: {doi}, updating dataframe")
                self.df.at[index, 'External DOI'] = doi
            try:
                bibtex = doi2bibber.get_bibtex_entry(doi)
                if bibtex is not None:
                    self.df.at[index, 'bibtex'] = bibtex
                    self.df.at[index, 'bibtex_str'] = doi2bibber.entries_to_str([bibtex])
                    self.logger.info(f"got bibtex for {doi}")
                else:
                    self.logger.warning(f"Couldn't get bibtex for {doi}")
            except Exception as e:
                # best-effort enrichment: a failed lookup must not abort the run
                self.logger.warning(f"Failed to get bibtex for {doi}: {e}")

    def _flatten(self):
        """Flatten each nested article dict into ``'a/b/c'`` path keys."""
        self.articles = [flatten(a, reducer='path') for a in self.articles]

    def retrieve(self, use_cache=True):
        """Run the full pipeline: fetch, filter, detail, normalise, frame, enrich.

        :param use_cache: forwarded to every FigShare API call
        """
        self._retrieve_figshare(use_cache=use_cache)
        self._remove_non_repository()
        # propagate the caller's choice (was hard-coded to True)
        self._retrieve_details(use_cache=use_cache)
        self._custom_fields_to_dicts()
        self._flatten()
        self._create_dataframe()
        self._retrieve_bibtex_from_dois()

    @staticmethod
    def _clean_doi(value):
        """Extract a bare DOI from a custom-field value.

        :param value: the raw custom-field entry; expected to be a list whose
            first element is a DOI, possibly prefixed by a doi.org URL or 'doi:'
        :return: cleaned DOI string, or None when the value is absent/not a list
        """
        if isinstance(value, list) and value:
            doi = re.sub(r'^(?:https?://doi\.org/|doi:)', '', value[0], flags=re.IGNORECASE)
            # a stray lowercase 'doi:' may survive (e.g. after a URL prefix)
            return doi.replace('doi:', '')
        return None

    def _create_dataframe(self):
        """Build ``self.df`` from the flattened articles.

        Adds 'author', 'online_date' (tz-aware), 'online_year' and a cleaned
        'External DOI' column.

        :return: the DataFrame, or None when there are no articles
        """
        if not self.articles:
            self.logger.warning(f"no articles found for {self.name}, can't create dataframe")
            self.df = None
            return None
        self.df = pd.DataFrame.from_dict(self.articles)
        self.df['author'] = self.name
        self.df['online_date'] = pd.to_datetime(self.df['timeline/firstOnline'], utc=True)
        # vectorized accessor instead of a per-row lambda
        self.df['online_year'] = self.df['online_date'].dt.year
        doi_col = 'details/custom_fields/External DOI'
        if doi_col in self.df.columns:
            self.df['External DOI'] = self.df[doi_col].apply(self._clean_doi)
        else:
            # no article carried the field; keep the column so later stages work
            self.df['External DOI'] = None
        return self.df
0 commit comments