From c8a474ef594f1827a9e4e564a735a8c4f1755a89 Mon Sep 17 00:00:00 2001 From: Elizabeth Seiver Date: Sun, 10 Dec 2017 21:25:19 -0800 Subject: [PATCH 1/7] add partial dois regex & class method to make internal PLOS server processes easier for checking articles likely to have updated XML, adds `from_partial_doi` class method & `find_valid_partial_dois` regex. --- allofplos/article.py | 18 +++++++++++++++++- allofplos/plos_regex.py | 11 +++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/allofplos/article.py b/allofplos/article.py index bad63c3b..425a5739 100644 --- a/allofplos/article.py +++ b/allofplos/article.py @@ -10,7 +10,7 @@ from . import get_corpus_dir from .transformations import (filename_to_doi, _get_base_page, LANDING_PAGE_SUFFIX, URL_SUFFIX, plos_page_dict, doi_url, doi_to_url, doi_to_path) -from .plos_regex import validate_doi +from .plos_regex import validate_doi, find_valid_partial_dois from .elements import (parse_article_date, get_contrib_info, Journal, License, match_contribs_to_dicts) from .utils import dedent @@ -1336,3 +1336,19 @@ def from_filename(cls, filename): else: directory = None return cls(filename_to_doi(filename), directory=directory) + + @classmethod + def from_partial_doi(cls, partial_doi): + """Initiate an article object using a partial DOI. + Uses regex to make sure it's a valid partial DOI. + Used for internal PLOS methods. + """ + doi = '' + if len(find_valid_partial_dois(partial_doi)) == 1: + if 'annotation' in partial_doi: + doi = '10.1371/' + partial_doi + else: + doi = '10.1371/journal.' 
+ partial_doi + else: + print(find_valid_partial_dois(partial_doi)) + return cls(doi) diff --git a/allofplos/plos_regex.py b/allofplos/plos_regex.py index 0939d613..b37b563a 100644 --- a/allofplos/plos_regex.py +++ b/allofplos/plos_regex.py @@ -22,6 +22,8 @@ full_doi_regex_match = re.compile(regex_match_prefix+regex_body_match) full_doi_regex_search = re.compile(r"10\.1371/journal\.p[a-zA-Z]{3}\.[\d]{7}" "|10\.1371/annotation/[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}") +partial_doi_regex_search = re.compile(r"p[a-zA-Z]{3}\.[\d]{7}" + "|annotation/[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}") currents_doi_regex = re.compile(regex_match_prefix+regex_body_currents) file_regex_match = re.compile(regex_file_search+r"\.xml") BASE_URL = 'http://journals.plos.org/plosone/article/file?id=' @@ -75,6 +77,15 @@ def find_valid_dois(doi): return full_doi_regex_search.findall(doi) +def find_valid_partial_dois(doi): + """ + For an individual string, searches for any valid partial PLOS DOIs within it and returns them + Uses for finding DOIs in PLOS job tickets + :return: list of valid PLOS partial DOIs contained within string + """ + return partial_doi_regex_search.findall(doi) + + def show_invalid_dois(doi_list): """ Checks to see whether a list of PLOS DOIs follow the correct format. 
Used mainly to determine From dee96f46bc704399236f9c055271cc76f4374478 Mon Sep 17 00:00:00 2001 From: Elizabeth Seiver Date: Thu, 15 Feb 2018 12:25:55 -0800 Subject: [PATCH 2/7] add partial_doi property --- allofplos/article.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/allofplos/article.py b/allofplos/article.py index 425a5739..bbc0a40f 100644 --- a/allofplos/article.py +++ b/allofplos/article.py @@ -124,6 +124,11 @@ def doi(self): """ return self._doi + @property + def partial_doi(self): + """Convert a DOI to a partial DOI.""" + return self.doi.lstrip('10.1371/').replace('journal.', '') + @property def text_viewer(self): """Command line application for viewing text to be used with From 8ca7b2079f207b98bccd6e2c3137fc67d8c07bd4 Mon Sep 17 00:00:00 2001 From: Elizabeth Seiver Date: Thu, 15 Mar 2018 15:16:05 -0700 Subject: [PATCH 3/7] add validate_partial_doi to regex --- allofplos/plos_regex.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/allofplos/plos_regex.py b/allofplos/plos_regex.py index b37b563a..9a586764 100644 --- a/allofplos/plos_regex.py +++ b/allofplos/plos_regex.py @@ -24,6 +24,8 @@ "|10\.1371/annotation/[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}") partial_doi_regex_search = re.compile(r"p[a-zA-Z]{3}\.[\d]{7}" "|annotation/[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}") +partial_doi_regex_match = re.compile(r"^p[a-zA-Z]{3}\.[\d]{7}$" + r"|^annotation/[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}$") currents_doi_regex = re.compile(regex_match_prefix+regex_body_currents) file_regex_match = re.compile(regex_file_search+r"\.xml") BASE_URL = 'http://journals.plos.org/plosone/article/file?id=' @@ -43,6 +45,14 @@ def validate_doi(doi): return bool(full_doi_regex_match.search(doi)) +def validate_partial_doi(partial_doi): + """For an individual string, tests whether the full string is in a valid PLOS partial DOI format. 
+ Example: 'pbio.2000777' is True, but '10.1371/journal.pbio.2000777' is False + :return: True if string is in valid PLOS partial DOI format; False if not + """ + return bool(partial_doi_regex_match.search(partial_doi)) + + def validate_filename(filename): """ For an individual string, tests whether the full string is in a valid article file. This can take two forms. @@ -80,7 +90,7 @@ def find_valid_dois(doi): def find_valid_partial_dois(doi): """ For an individual string, searches for any valid partial PLOS DOIs within it and returns them - Uses for finding DOIs in PLOS job tickets + Used for finding DOIs in PLOS job tickets :return: list of valid PLOS partial DOIs contained within string """ return partial_doi_regex_search.findall(doi) From 30ee36447b6480452fa8a428bc08f76a2e1037b8 Mon Sep 17 00:00:00 2001 From: Elizabeth Seiver Date: Thu, 15 Mar 2018 15:33:29 -0700 Subject: [PATCH 4/7] add two-way doi <-> partial_doi conversions --- allofplos/transformations.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/allofplos/transformations.py b/allofplos/transformations.py index 38de9608..c116e765 100644 --- a/allofplos/transformations.py +++ b/allofplos/transformations.py @@ -5,7 +5,7 @@ from . 
import get_corpus_dir -from .plos_regex import validate_filename, validate_doi +from .plos_regex import validate_filename, validate_doi, validate_partial_doi from .elements import Journal # URL bases for PLOS's Solr instances, that index PLOS articles @@ -183,6 +183,22 @@ def doi_to_path(doi, directory=None): article_file = os.path.join(directory, doi.lstrip(PREFIX) + SUFFIX_LOWER) return article_file +def partial_to_doi(partial_doi): + """Convert a partial DOI into a DOI.""" + if validate_partial_doi(partial_doi) is False: + raise Exception("Invalid format for PLOS partial DOI: {}".format(partial_doi)) + if partial_doi.startswith('annotation'): + doi = PREFIX + partial_doi + else: + doi = ''.join([PREFIX, 'journal.', partial_doi]) + return doi + +def doi_to_partial(doi): + """Convert a DOI into a partial DOI.""" + if validate_doi(doi) is False: + raise Exception("Invalid format for PLOS DOI: {}".format(doi)) + return doi.lstrip('10.1371/').replace('journal.', '') + def convert_country(country): """ From 322e7d0d58abeb71b9d162d7709cf841922d6397 Mon Sep 17 00:00:00 2001 From: Elizabeth Seiver Date: Thu, 15 Mar 2018 15:33:48 -0700 Subject: [PATCH 5/7] update from_partial_doi class method --- allofplos/article.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/allofplos/article.py b/allofplos/article.py index bbc0a40f..264563ca 100644 --- a/allofplos/article.py +++ b/allofplos/article.py @@ -9,7 +9,8 @@ from . 
import get_corpus_dir from .transformations import (filename_to_doi, _get_base_page, LANDING_PAGE_SUFFIX, - URL_SUFFIX, plos_page_dict, doi_url, doi_to_url, doi_to_path) + URL_SUFFIX, plos_page_dict, doi_url, doi_to_url, doi_to_path, + partial_to_doi) from .plos_regex import validate_doi, find_valid_partial_dois from .elements import (parse_article_date, get_contrib_info, Journal, License, match_contribs_to_dicts) @@ -1343,17 +1344,13 @@ def from_filename(cls, filename): return cls(filename_to_doi(filename), directory=directory) @classmethod - def from_partial_doi(cls, partial_doi): + def from_partial_doi(cls, partial_doi, directory=None): """Initiate an article object using a partial DOI. Uses regex to make sure it's a valid partial DOI. Used for internal PLOS methods. """ - doi = '' - if len(find_valid_partial_dois(partial_doi)) == 1: - if 'annotation' in partial_doi: - doi = '10.1371/' + partial_doi - else: - doi = '10.1371/journal.' + partial_doi - else: - print(find_valid_partial_dois(partial_doi)) - return cls(doi) + if directory is None: + directory = get_corpus_dir() + doi = partial_to_doi(partial_doi) + + return cls(doi, directory=directory) From 2148ccf61bc6d4efd6292e756d665f22d0ff3423 Mon Sep 17 00:00:00 2001 From: Elizabeth Seiver Date: Thu, 15 Mar 2018 17:40:16 -0700 Subject: [PATCH 6/7] update Corpus() to take partial doi Just as it can initialize an Article object from a DOI, it can now do so from a partial DOI as well. --- allofplos/corpus/corpus.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/allofplos/corpus/corpus.py b/allofplos/corpus/corpus.py index fc176ef3..dbd3ec90 100644 --- a/allofplos/corpus/corpus.py +++ b/allofplos/corpus/corpus.py @@ -5,7 +5,8 @@ from itertools import islice from .. 
import get_corpus_dir, Article -from ..transformations import filename_to_doi, doi_to_path +from ..transformations import filename_to_doi, doi_to_path, partial_to_doi +from ..plos_regex import validate_doi, validate_partial_doi class Corpus: @@ -36,13 +37,16 @@ def __iter__(self): return (article for article in self.random_article_generator) def __getitem__(self, key): - if isinstance(key, int): return Article(self.dois[key], directory=self.directory) elif isinstance(key, slice): return (Article(doi, directory=self.directory) for doi in self.dois[key]) elif key not in self.dois: + if partial_to_doi(key) in self.dois: + return Article.from_partial_doi(key, directory=self.directory) + elif validate_partial_doi(key): + key = partial_to_doi(key) path= doi_to_path(key, directory=self.directory) raise IndexError(("You attempted get {doi} from " "the corpus at \n{directory}. \n" From ee717458dda2a360ab26f9e48d86ea6ad981de9a Mon Sep 17 00:00:00 2001 From: Elizabeth Seiver Date: Mon, 19 Mar 2018 11:47:33 -0700 Subject: [PATCH 7/7] add tests for partial dois --- allofplos/tests/test_partial_dois.py | 52 ++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 allofplos/tests/test_partial_dois.py diff --git a/allofplos/tests/test_partial_dois.py b/allofplos/tests/test_partial_dois.py new file mode 100644 index 00000000..b750fd2c --- /dev/null +++ b/allofplos/tests/test_partial_dois.py @@ -0,0 +1,52 @@ +from .. 
import Corpus, Article, starterdir +from ..plos_regex import validate_partial_doi, validate_doi +from ..transformations import partial_to_doi, doi_to_partial + +import pytest + + +@pytest.fixture +def corpus(): + return Corpus(starterdir, seed=1000) + + +@pytest.fixture +def test_article(): + return Article('10.1371/journal.pone.0040259', directory=starterdir) + + +@pytest.fixture +def test_doi(): + return '10.1371/journal.pone.0040259' + + +@pytest.fixture +def test_partial_doi(): + return 'pone.0040259' + + +def test_partial_doi_regex(test_partial_doi): + assert validate_partial_doi(test_partial_doi) + assert not validate_partial_doi(' pone.0040259') + assert not validate_partial_doi('pone.0040259 ') + + +def test_partial_doi_transform(test_doi, test_partial_doi): + partial_doi = doi_to_partial(test_doi) + assert partial_doi == test_partial_doi + + +def test_doi_transform(test_partial_doi, test_doi): + doi = partial_to_doi(test_partial_doi) + assert validate_doi(doi) + assert doi == test_doi + + +def test_partial_doi_method_article(test_partial_doi, test_article): + article = Article.from_partial_doi(test_partial_doi, directory=starterdir) + assert article == test_article + + +def test_partial_doi_method_corpus(corpus, test_article, test_partial_doi): + article = corpus[test_partial_doi] + assert article == test_article