(Frontend diff hunk; markup was stripped during extraction. Recoverable UI strings kept below.)
+ Enter a DOI to build an interactive citation network. The graph shows
+ related papers based on bibliographic coupling, co-citation, and direct citations.
Building citation network... This may take a moment.
An error occurred
+ Citation Network
+ Related Papers
Library
Zotero-style paper management. Coming in Phase 2.
Search
Search across CrossRef, OpenAlex, and your local library. Coming in Phase 3.
Metadata Enrichment
Enrich BibTeX with abstracts, citations, and impact factors. Coming in Phase 4.
diff --git a/src/scitex/scholar/integration/zotero/__init__.py b/src/scitex/scholar/integration/zotero/__init__.py
index 820f3c837..8cde4b794 100755
--- a/src/scitex/scholar/integration/zotero/__init__.py
+++ b/src/scitex/scholar/integration/zotero/__init__.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
"""
Zotero integration for SciTeX Scholar module.
@@ -17,11 +16,14 @@
from .exporter import ZoteroExporter
from .importer import ZoteroImporter
from .linker import ZoteroLinker
+from .local_reader import ZoteroLocalReader, export_for_zotero
from .mapper import ZoteroMapper
__all__ = [
"ZoteroImporter",
"ZoteroExporter",
"ZoteroLinker",
+ "ZoteroLocalReader",
"ZoteroMapper",
+ "export_for_zotero",
]
diff --git a/src/scitex/scholar/integration/zotero/local_reader.py b/src/scitex/scholar/integration/zotero/local_reader.py
new file mode 100755
index 000000000..f1b58e1ad
--- /dev/null
+++ b/src/scitex/scholar/integration/zotero/local_reader.py
@@ -0,0 +1,397 @@
+#!/usr/bin/env python3
+"""
+Zotero local SQLite reader — no API key required.
+
+Reads directly from Zotero's local database file (zotero.sqlite).
+Auto-detects Linux and Windows (WSL) Zotero installations.
+
+Usage:
+ from scitex.scholar.integration.zotero import ZoteroLocalReader, export_for_zotero
+
+ reader = ZoteroLocalReader() # auto-detect
+ papers = reader.read_all() # all items
+ papers = reader.read_by_tags(["EEG"]) # filter by tag
+ export_for_zotero(papers, "out.bib") # export for Zotero > File > Import
+"""
+
+from __future__ import annotations
+
+import sqlite3
+from pathlib import Path
+from typing import Dict, List, Optional
+
+from scitex.scholar.core.Papers import Papers
+
+from .mapper import ZoteroMapper
+
+# ── Known Zotero DB paths ─────────────────────────────────────────────────────
+
+_LINUX_PATH = Path("~/Zotero/zotero.sqlite").expanduser()
+_WSL_BASE = Path("/mnt/c/Users")
+
+_SKIP_TYPES = {"attachment", "note", "annotation"}
+
+
+# ── Reader ────────────────────────────────────────────────────────────────────
+
+
+class ZoteroLocalReader:
+ """Read papers from a local Zotero SQLite database.
+
+ Parameters
+ ----------
+ db_path : str or Path, optional
+ Path to zotero.sqlite. If None, auto-detects Linux then WSL paths.
+ project : str
+ Scholar project name for the returned Papers collection.
+ """
+
+ def __init__(
+ self,
+ db_path: Optional[str | Path] = None,
+ project: str = "default",
+ ):
+ self.db_path = Path(db_path) if db_path else self._detect_db_path()
+ self.project = project
+ self._mapper = ZoteroMapper()
+
+ # ── Public methods ────────────────────────────────────────────────────────
+
+ def read_all(self, limit: Optional[int] = None) -> Papers:
+ """Read all non-attachment items from the Zotero library.
+
+ Parameters
+ ----------
+ limit : int, optional
+ Maximum number of items to return.
+
+ Returns
+ -------
+ Papers
+ """
+ item_ids = self._fetch_item_ids(limit=limit)
+ return self._build_papers(item_ids)
+
+ def read_by_collection(self, name: str) -> Papers:
+ """Read items belonging to a Zotero collection.
+
+ Parameters
+ ----------
+ name : str
+ Collection name (case-sensitive).
+
+ Returns
+ -------
+ Papers
+ """
+ with self._connect() as conn:
+ rows = conn.execute(
+ """
+ SELECT ci.itemID
+ FROM collectionItems ci
+ JOIN collections col ON ci.collectionID = col.collectionID
+ WHERE col.collectionName = ?
+ """,
+ (name,),
+ ).fetchall()
+ item_ids = [r[0] for r in rows]
+ return self._build_papers(item_ids)
+
+ def read_by_tags(self, tags: List[str], match_all: bool = False) -> Papers:
+ """Read items matching given tags.
+
+ Parameters
+ ----------
+ tags : list of str
+ Tag names to filter by.
+ match_all : bool
+ If True, items must have ALL listed tags.
+ If False (default), items with ANY listed tag are returned.
+
+ Returns
+ -------
+ Papers
+ """
+ placeholders = ",".join("?" * len(tags))
+ with self._connect() as conn:
+ rows = conn.execute(
+ f"""
+ SELECT it.itemID, COUNT(DISTINCT t.name) as tag_count
+ FROM itemTags it
+ JOIN tags t ON it.tagID = t.tagID
+ WHERE t.name IN ({placeholders})
+ GROUP BY it.itemID
+ """,
+ tags,
+ ).fetchall()
+
+ required = len(tags) if match_all else 1
+ item_ids = [r[0] for r in rows if r[1] >= required]
+ return self._build_papers(item_ids)
+
+ def list_collections(self) -> List[str]:
+ """Return all collection names in the Zotero library, sorted alphabetically.
+
+ Returns
+ -------
+ list of str
+ Collection names sorted alphabetically.
+ """
+ with self._connect() as conn:
+ rows = conn.execute(
+ "SELECT collectionName FROM collections ORDER BY collectionName"
+ ).fetchall()
+ return [r[0] for r in rows]
+
+ def list_tags(self) -> List[Dict]:
+ """Return all tag names with occurrence counts, sorted by count descending.
+
+ Returns
+ -------
+ list of dict
+ Tags with structure: [{"name": str, "count": int}, ...] sorted by count (descending).
+ """
+ with self._connect() as conn:
+ rows = conn.execute(
+ "SELECT name, COUNT(*) as cnt FROM tags GROUP BY name ORDER BY cnt DESC"
+ ).fetchall()
+ return [{"name": r[0], "count": r[1]} for r in rows]
+
+ # ── Internal helpers ──────────────────────────────────────────────────────
+
+ def _detect_db_path(self) -> Path:
+ """Auto-detect Zotero SQLite: Linux first, then WSL Windows mount."""
+ if _LINUX_PATH.exists():
+ return _LINUX_PATH
+ if _WSL_BASE.exists():
+ for candidate in _WSL_BASE.glob("*/Zotero/zotero.sqlite"):
+ if candidate.exists():
+ return candidate
+ raise FileNotFoundError(
+ "No Zotero database found. Checked:\n"
+ f" {_LINUX_PATH}\n"
+ f" {_WSL_BASE}/*/Zotero/zotero.sqlite\n"
+ "Pass db_path explicitly: ZoteroLocalReader(db_path='/path/to/zotero.sqlite')"
+ )
+
+ def _connect(self) -> sqlite3.Connection:
+ """Open a read-only SQLite connection."""
+ conn = sqlite3.connect(f"file:{self.db_path}?mode=ro", uri=True)
+ conn.row_factory = sqlite3.Row
+ return conn
+
+ def _fetch_item_ids(self, limit: Optional[int] = None) -> List[int]:
+ """Fetch IDs of all non-attachment, non-note items."""
+ skip = ",".join(f"'{t}'" for t in _SKIP_TYPES)
+ limit_clause = f"LIMIT {limit}" if limit else ""
+ with self._connect() as conn:
+ rows = conn.execute(
+ f"""
+ SELECT i.itemID
+ FROM items i
+ JOIN itemTypes it ON i.itemTypeID = it.itemTypeID
+ WHERE it.typeName NOT IN ({skip})
+ ORDER BY i.itemID
+ {limit_clause}
+ """
+ ).fetchall()
+ return [r[0] for r in rows]
+
+ def _build_papers(self, item_ids: List[int]) -> Papers:
+ """Batch-load all data for the given item IDs and convert to Papers."""
+ if not item_ids:
+ return Papers([], project=self.project)
+
+ ids_str = ",".join(str(i) for i in item_ids)
+
+ with self._connect() as conn:
+ # Item base info
+ type_rows = conn.execute(
+ f"""
+ SELECT i.itemID, i.key, it.typeName
+ FROM items i
+ JOIN itemTypes it ON i.itemTypeID = it.itemTypeID
+ WHERE i.itemID IN ({ids_str})
+ """
+ ).fetchall()
+
+ # All field values (batch)
+ field_rows = conn.execute(
+ f"""
+ SELECT id.itemID, f.fieldName, idv.value
+ FROM itemData id
+ JOIN fields f ON id.fieldID = f.fieldID
+ JOIN itemDataValues idv ON id.valueID = idv.valueID
+ WHERE id.itemID IN ({ids_str})
+ """
+ ).fetchall()
+
+ # Creators (ordered)
+ creator_rows = conn.execute(
+ f"""
+ SELECT ic.itemID, c.firstName, c.lastName, ct.creatorType
+ FROM itemCreators ic
+ JOIN creators c ON ic.creatorID = c.creatorID
+ JOIN creatorTypes ct ON ic.creatorTypeID = ct.creatorTypeID
+ WHERE ic.itemID IN ({ids_str})
+ ORDER BY ic.itemID, ic.orderIndex
+ """
+ ).fetchall()
+
+ # Tags
+ tag_rows = conn.execute(
+ f"""
+ SELECT it.itemID, t.name
+ FROM itemTags it
+ JOIN tags t ON it.tagID = t.tagID
+ WHERE it.itemID IN ({ids_str})
+ """
+ ).fetchall()
+
+ # Group by itemID
+ fields: Dict[int, Dict[str, str]] = {i: {} for i in item_ids}
+ for row in field_rows:
+ fields[row[0]][row[1]] = row[2]
+
+ creators: Dict[int, List[dict]] = {i: [] for i in item_ids}
+ for row in creator_rows:
+ creators[row[0]].append(
+ {
+ "firstName": row[1] or "",
+ "lastName": row[2] or "",
+ "creatorType": row[3],
+ }
+ )
+
+ tags: Dict[int, List[str]] = {i: [] for i in item_ids}
+ for row in tag_rows:
+ tags[row[0]].append(row[1])
+
+ # Convert to Papers via ZoteroMapper
+ paper_list = []
+ for row in type_rows:
+ item_id, key, type_name = row[0], row[1], row[2]
+ api_dict = self._to_api_format(
+ key,
+ type_name,
+ fields.get(item_id, {}),
+ creators.get(item_id, []),
+ tags.get(item_id, []),
+ )
+ try:
+ paper = self._mapper.zotero_to_paper(api_dict)
+ paper_list.append(paper)
+ except Exception:
+ pass # Skip malformed items silently
+
+ return Papers(paper_list, project=self.project)
+
+ def _to_api_format(
+ self,
+ key: str,
+ type_name: str,
+ fields: Dict[str, str],
+ creators: List[dict],
+ tags: List[str],
+ ) -> dict:
+ """Convert raw SQLite rows to the Zotero API dict format ZoteroMapper expects."""
+ return {
+ "key": key,
+ "version": 0,
+ "data": {
+ "itemType": type_name,
+ "title": fields.get("title", ""),
+ "abstractNote": fields.get("abstractNote", ""),
+ "creators": creators,
+ "date": fields.get("date", ""),
+ "DOI": fields.get("DOI", ""),
+ "url": fields.get("url", ""),
+ "publicationTitle": fields.get("publicationTitle", ""),
+ "journalAbbreviation": fields.get("journalAbbreviation", ""),
+ "volume": fields.get("volume", ""),
+ "issue": fields.get("issue", ""),
+ "pages": fields.get("pages", ""),
+ "publisher": fields.get("publisher", ""),
+ "ISSN": fields.get("ISSN", ""),
+ "ISBN": fields.get("ISBN", ""),
+ "extra": fields.get("extra", ""),
+ "language": fields.get("language", ""),
+ "tags": [{"tag": t} for t in tags],
+ "collections": [],
+ },
+ }
+
+
+# ── Convenience export ────────────────────────────────────────────────────────
+
+
+def export_for_zotero(papers: Papers, path: str | Path, fmt: str = "bibtex") -> Path:
+ """Export papers to a file that Zotero can import via File > Import.
+
+ Parameters
+ ----------
+ papers : Papers
+ Papers collection to export.
+ path : str or Path
+ Output file path (e.g. 'output.bib', 'output.ris').
+ fmt : str
+ Format: 'bibtex' (default) or 'ris'.
+
+ Returns
+ -------
+ Path
+ The written file path.
+
+ Example
+ -------
+ >>> reader = ZoteroLocalReader()
+ >>> papers = reader.read_all()
+ >>> export_for_zotero(papers, "enriched.bib")
+ >>> # Then: Zotero > File > Import > enriched.bib
+ """
+ from scitex.scholar.formatting import papers_to_format
+
+ path = Path(path)
+
+ # Convert Papers (which may hold Paper objects) to plain dicts for formatting
+ paper_dicts = []
+ for p in papers:
+ if hasattr(p, "metadata"):
+ # Paper object — convert to formatting dict
+ paper_dicts.append(_paper_obj_to_dict(p))
+ elif isinstance(p, dict):
+ paper_dicts.append(p)
+
+ content = papers_to_format(paper_dicts, fmt)
+ path.write_text(content, encoding="utf-8")
+ return path
+
+
+def _paper_obj_to_dict(paper) -> dict:
+ """Convert Paper object to the plain dict format used by formatting.py."""
+ m = paper.metadata
+ authors_list = getattr(m.basic, "authors", []) or []
+ return {
+ "title": getattr(m.basic, "title", "") or "",
+ "authors_str": " and ".join(authors_list),
+ "year": str(getattr(m.basic, "year", "") or ""),
+ "abstract": getattr(m.basic, "abstract", "") or "",
+ "journal": getattr(m.publication, "journal", "") or "",
+ "volume": getattr(m.publication, "volume", "") or "",
+ "number": getattr(m.publication, "issue", "") or "",
+ "pages": getattr(m.publication, "pages", "") or "",
+ "doi": getattr(m.id, "doi", "") or "",
+ "pmid": getattr(m.id, "pmid", "") or "",
+ "arxiv_id": getattr(m.id, "arxiv_id", "") or "",
+ "url": (getattr(m.url, "publisher", "") or getattr(m.url, "doi", "") or ""),
+ "document_type": getattr(m.basic, "type", "article") or "article",
+ "is_open_access": False,
+ "source": "zotero",
+ }
+
+
+# ── Public API ────────────────────────────────────────────────────────────────
+
+__all__ = ["ZoteroLocalReader", "export_for_zotero"]
+
+# EOF
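
A minimal usage sketch for the new reader; the project name, tags, and output filename below are illustrative, and a populated local zotero.sqlite is assumed:

    from scitex.scholar.integration.zotero import (
        ZoteroLocalReader,
        export_for_zotero,
    )

    reader = ZoteroLocalReader(project="demo")   # auto-detects zotero.sqlite
    print(reader.list_collections())             # all collection names
    print(reader.list_tags()[:5])                # most frequent tags first

    # OR-match on tags, then write a file Zotero can re-import
    papers = reader.read_by_tags(["EEG", "Epilepsy"], match_all=False)
    export_for_zotero(papers, "eeg-epilepsy.bib", fmt="bibtex")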
diff --git a/src/scitex/scholar/metadata_engines/utils/__init__.py b/src/scitex/scholar/metadata_engines/utils/__init__.py
index 376b61418..af01a431c 100755
--- a/src/scitex/scholar/metadata_engines/utils/__init__.py
+++ b/src/scitex/scholar/metadata_engines/utils/__init__.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
# Timestamp: "2025-08-04 08:15:00 (ywatanabe)"
# File: /home/ywatanabe/proj/scitex_repo/src/scitex/scholar/doi/utils/__init__.py
# ----------------------------------------
@@ -12,7 +11,7 @@
# ----------------------------------------
# Import TextNormalizer from central location
-from scitex.scholar.utils.text import TextNormalizer
+from scitex.scholar._utils.text import TextNormalizer
from ._metadata2bibtex import metadata2bibtex
from ._PubMedConverter import PubMedConverter, pmid2doi
diff --git a/src/scitex/scholar/pipelines/SearchQueryParser.py b/src/scitex/scholar/pipelines/SearchQueryParser.py
index e3d96b706..e2617116a 100755
--- a/src/scitex/scholar/pipelines/SearchQueryParser.py
+++ b/src/scitex/scholar/pipelines/SearchQueryParser.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
# File: ./src/scitex/scholar/pipelines/SearchQueryParser.py
"""
@@ -53,6 +52,12 @@ def __init__(self, query: str):
self.max_citations: Optional[int] = None
self.open_access: Optional[bool] = None
self.document_type: Optional[str] = None
+ self.title_includes: List[str] = []
+ self.title_excludes: List[str] = []
+ self.author_includes: List[str] = []
+ self.author_excludes: List[str] = []
+ self.journal_includes: List[str] = []
+ self.journal_excludes: List[str] = []
self._parse()
@@ -161,9 +166,134 @@ def get_filters(self) -> Dict[str, Any]:
filters["open_access"] = self.open_access
if self.document_type is not None:
filters["document_type"] = self.document_type
+ if self.title_includes:
+ filters["title_includes"] = self.title_includes
+ if self.title_excludes:
+ filters["title_excludes"] = self.title_excludes
+ if self.author_includes:
+ filters["author_includes"] = self.author_includes
+ if self.author_excludes:
+ filters["author_excludes"] = self.author_excludes
+ if self.journal_includes:
+ filters["journal_includes"] = self.journal_includes
+ if self.journal_excludes:
+ filters["journal_excludes"] = self.journal_excludes
return filters
+ @classmethod
+ def from_shell_syntax(cls, query: str) -> "SearchQueryParser":
+ """Parse shell-style operators from a query string.
+
+ Supports the following shell-style operators:
+ -t VALUE or --title VALUE : Title include filter
+ -t -VALUE : Title exclude filter (- prefix on value)
+ -a VALUE or --author VALUE: Author include filter
+ -a -VALUE : Author exclude filter
+ -j VALUE or --journal VALUE: Journal include filter
+ -j -VALUE : Journal exclude filter
+ -ymin YYYY or --year-min YYYY: Minimum year
+ -ymax YYYY or --year-max YYYY: Maximum year
+ -cmin N or --citations-min N : Minimum citations
+ -cmax N or --citations-max N : Maximum citations
+ -ifmin N or --if-min N : Minimum impact factor
+ -ifmax N or --if-max N : Maximum impact factor
+
+ Parameters
+ ----------
+ query : str
+ Query string with shell-style operators.
+
+ Returns
+ -------
+ SearchQueryParser
+ Instance with parsed fields set.
+
+ Example
+ -------
+ parser = SearchQueryParser.from_shell_syntax(
+ "hippocampus -t theta -a -Smith -ymin 2020 -cmin 50"
+ )
+ """
+ # Create the instance without running the standard _parse() on the raw
+ # query: bypass __init__ via cls.__new__, then set original_query and
+ # the parsed fields manually.
+ instance = cls.__new__(cls)
+ instance.original_query = query
+ instance.positive_keywords = []
+ instance.negative_keywords = []
+ instance.year_start = None
+ instance.year_end = None
+ instance.min_impact_factor = None
+ instance.max_impact_factor = None
+ instance.min_citations = None
+ instance.max_citations = None
+ instance.open_access = None
+ instance.document_type = None
+ instance.title_includes = []
+ instance.title_excludes = []
+ instance.author_includes = []
+ instance.author_excludes = []
+ instance.journal_includes = []
+ instance.journal_excludes = []
+
+ if not query:
+ return instance
+
+ remaining = query
+
+ # Text filters: -t/-a/-j (value may be prefixed with - for exclude).
+ # Quoted alternatives must come first in the alternation; otherwise
+ # [^\s]+ consumes the opening quote and splits a quoted phrase.
+ text_patterns = [
+ (r'(?:-t|--title)\s+(-?)("[^"]+"|\'[^\']+\'|[^\s]+)', "title"),
+ (r'(?:-a|--author)\s+(-?)("[^"]+"|\'[^\']+\'|[^\s]+)', "author"),
+ (r'(?:-j|--journal)\s+(-?)("[^"]+"|\'[^\']+\'|[^\s]+)', "journal"),
+ ]
+
+ for pattern, field_name in text_patterns:
+ for match in re.finditer(pattern, remaining, re.IGNORECASE):
+ is_exclude = match.group(1) == "-"
+ value = match.group(2).strip("\"'")
+ if is_exclude:
+ getattr(instance, f"{field_name}_excludes").append(value)
+ else:
+ getattr(instance, f"{field_name}_includes").append(value)
+ remaining = re.sub(pattern, "", remaining, flags=re.IGNORECASE)
+
+ # Numeric filters
+ numeric_patterns = [
+ (r"(?:-ymin|--year-min)\s+(\d{4})", "year_min"),
+ (r"(?:-ymax|--year-max)\s+(\d{4})", "year_max"),
+ (r"(?:-cmin|--citations-min)\s+(\d+)", "citations_min"),
+ (r"(?:-cmax|--citations-max)\s+(\d+)", "citations_max"),
+ (r"(?:-ifmin|--if-min)\s+(\d+(?:\.\d+)?)", "impact_factor_min"),
+ (r"(?:-ifmax|--if-max)\s+(\d+(?:\.\d+)?)", "impact_factor_max"),
+ ]
+
+ field_mapping = {
+ "year_min": "year_start",
+ "year_max": "year_end",
+ "citations_min": "min_citations",
+ "citations_max": "max_citations",
+ "impact_factor_min": "min_impact_factor",
+ "impact_factor_max": "max_impact_factor",
+ }
+
+ for pattern, field_name in numeric_patterns:
+ match = re.search(pattern, remaining, re.IGNORECASE)
+ if match:
+ raw_value = match.group(1)
+ if "impact_factor" in field_name:
+ value = float(raw_value)
+ elif "year" in field_name:
+ value = int(raw_value)
+ else:
+ value = int(raw_value)
+ attr_name = field_mapping[field_name]
+ setattr(instance, attr_name, value)
+ remaining = re.sub(pattern, "", remaining, flags=re.IGNORECASE)
+
+ # Remaining text becomes positive keywords
+ words = remaining.split()
+ instance.positive_keywords = [w.strip() for w in words if w.strip()]
+
+ return instance
+
def get_api_filters(self) -> Dict[str, Any]:
"""Get filters that can be pushed to API level."""
api_filters = {}
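
A sketch of the shell-style parsing path; the query below is an assumed example, not a project fixture:

    from scitex.scholar.pipelines.SearchQueryParser import SearchQueryParser

    parser = SearchQueryParser.from_shell_syntax(
        "hippocampus -t theta -a -Smith -ymin 2020 -cmin 50"
    )
    assert parser.title_includes == ["theta"]     # -t VALUE includes
    assert parser.author_excludes == ["Smith"]    # -a -VALUE excludes
    assert parser.year_start == 2020
    assert parser.min_citations == 50
    assert parser.positive_keywords == ["hippocampus"]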
diff --git a/src/scitex/scholar/storage/_LibraryManager.py b/src/scitex/scholar/storage/_LibraryManager.py
index eac2073e4..2288f540a 100755
--- a/src/scitex/scholar/storage/_LibraryManager.py
+++ b/src/scitex/scholar/storage/_LibraryManager.py
@@ -24,6 +24,7 @@
from __future__ import annotations
+from pathlib import Path
from typing import Optional
from scitex import logging
@@ -111,12 +112,22 @@ def __init__(
project: str = None,
single_doi_resolver=None,
config: Optional[ScholarConfig] = None,
+ project_dir=None,
):
- """Initialize library manager."""
+ """Initialize library manager.
+
+ Parameters
+ ----------
+ project_dir : str or Path, optional
+ Root of the user's code project (e.g. ``~/my-project``).
+ When provided, project-local symlinks are also created at
+ ``{project_dir}/scitex/scholar/library/{project}/``.
+ """
self.config = config or ScholarConfig()
self.project = self.config.resolve("project", project)
self.library_master_dir = self.config.path_manager.get_library_master_dir()
self.single_doi_resolver = single_doi_resolver
+ self.project_dir = Path(project_dir) if project_dir else None
self._source_filename = "papers"
self.dedup_manager = DeduplicationManager(config=self.config)
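
A construction sketch for the new ``project_dir`` parameter; the project name and directory are placeholders:

    from pathlib import Path
    from scitex.scholar.storage import LibraryManager

    project_dir = Path.home() / "my-paper"        # hypothetical project root
    manager = LibraryManager(project="neuro2026", project_dir=project_dir)
    # Papers saved through this manager are now also symlinked under
    # {project_dir}/scitex/scholar/library/neuro2026/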
diff --git a/src/scitex/scholar/storage/__init__.py b/src/scitex/scholar/storage/__init__.py
index 3153ac2d7..7f1a15c76 100755
--- a/src/scitex/scholar/storage/__init__.py
+++ b/src/scitex/scholar/storage/__init__.py
@@ -24,6 +24,7 @@
)
from ._LibraryCacheManager import LibraryCacheManager
from ._LibraryManager import LibraryManager
+from ._search_filename import normalize_search_filename
from .BibTeXHandler import BibTeXHandler
from .PaperIO import PaperIO
from .ScholarLibrary import ScholarLibrary
@@ -39,4 +40,5 @@
"validate_bibtex_file",
"validate_bibtex_content",
"PaperIO",
+ "normalize_search_filename",
]
diff --git a/src/scitex/scholar/storage/_mixins/_paper_saving.py b/src/scitex/scholar/storage/_mixins/_paper_saving.py
index ef49f735d..c364c7903 100755
--- a/src/scitex/scholar/storage/_mixins/_paper_saving.py
+++ b/src/scitex/scholar/storage/_mixins/_paper_saving.py
@@ -194,7 +194,7 @@ def save_resolved_paper(
logger.success(f"Saved paper to MASTER Scholar library: {paper_id}")
- # Create project symlink if needed
+ # Create project symlinks if needed
if self.project and self.project not in ["master", "MASTER"]:
try:
readable_name = self._generate_readable_name(
@@ -204,11 +204,17 @@ def save_resolved_paper(
year=year,
journal=journal,
)
+ # ~/.scitex/scholar/library/{project}/ view
self._create_project_symlink(
master_storage_path=master_storage_path,
project=self.project,
readable_name=readable_name,
)
+ # {project_dir}/scitex/scholar/library/{project}/ view
+ self._create_project_local_symlink(
+ master_storage_path=master_storage_path,
+ readable_name=readable_name,
+ )
except Exception as exc_:
logger.error(f"Failed to create symlink for {paper_id}: {exc_}")
diff --git a/src/scitex/scholar/storage/_mixins/_symlink_handlers.py b/src/scitex/scholar/storage/_mixins/_symlink_handlers.py
index 433a74efb..fdcc5cf21 100755
--- a/src/scitex/scholar/storage/_mixins/_symlink_handlers.py
+++ b/src/scitex/scholar/storage/_mixins/_symlink_handlers.py
@@ -188,6 +188,66 @@ def _create_project_symlink(
logger.warning(f"Failed to create project symlink: {exc_}")
return None
+ def _create_project_local_symlink(
+ self,
+ master_storage_path: Path,
+ readable_name: str,
+ ) -> Optional[Path]:
+ """Create symlink inside the project's own directory tree.
+
+ Target location: ``{project_dir}/scitex/scholar/library/{project}/{readable_name}``
+ Target of symlink: absolute path to master storage entry.
+
+ This mirrors the ``~/.scitex/scholar/library/{project}/`` view directly
+ inside the user's code project so papers are visible alongside source code.
+
+ Parameters
+ ----------
+ master_storage_path : Path
+ Absolute path to the MASTER entry directory.
+ readable_name : str
+ Human-readable symlink name (PDF-xx_CC-... format).
+
+ Returns
+ -------
+ Optional[Path]
+ Path to the created symlink, or None on failure.
+ """
+ if not getattr(self, "project_dir", None):
+ return None
+ if not self.project or self.project in ("master", "MASTER"):
+ return None
+
+ try:
+ local_lib = (
+ Path(self.project_dir) / "scitex" / "scholar" / "library" / self.project
+ )
+ local_lib.mkdir(parents=True, exist_ok=True)
+
+ symlink_path = local_lib / readable_name
+
+ # Remove stale symlinks pointing to the same master entry
+ master_id = master_storage_path.name
+ for existing in local_lib.iterdir():
+ if not existing.is_symlink():
+ continue
+ try:
+ if (
+ existing.resolve().name == master_id
+ and existing.name != readable_name
+ ):
+ existing.unlink()
+ except Exception:
+ pass
+
+ if not symlink_path.exists():
+ # Absolute target; a relative one would break if the project moves
+ symlink_path.symlink_to(master_storage_path.resolve())
+ logger.success(
+ f"Created project-local symlink: {symlink_path} -> {master_storage_path}"
+ )
+ # Return the path even when the symlink already existed, so repeated
+ # calls are idempotent instead of falling through to an implicit None
+ return symlink_path
+
+ except Exception as exc_:
+ logger.warning(f"Failed to create project-local symlink: {exc_}")
+ return None
+
def _ensure_project_symlink(
self,
title: str,
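
For orientation, the layout this produces (entry ID, readable name, and home path are illustrative); each link resolves to the absolute master entry:

    {project_dir}/scitex/scholar/library/{project}/
        PDF-01_CC-000100_IF-005_2024_Smith_Nature
            -> /home/user/.scitex/scholar/library/MASTER/ABC123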
diff --git a/src/scitex/scholar/storage/_search_filename.py b/src/scitex/scholar/storage/_search_filename.py
new file mode 100755
index 000000000..8d0d6082d
--- /dev/null
+++ b/src/scitex/scholar/storage/_search_filename.py
@@ -0,0 +1,91 @@
+#!/usr/bin/env python3
+# File: src/scitex/scholar/storage/_search_filename.py
+
+"""Generate normalized filenames for saved search results.
+
+Format: ``YYYYMMDD-HHMMSS-{normalized-query}.{ext}``
+
+Example::
+
+ from scitex.scholar.storage import normalize_search_filename
+
+ fname = normalize_search_filename("hippocampus theta year:2020-2024")
+ # -> "20260218-083000-hippocampus-theta-2020-2024.bib"
+"""
+
+import re
+from datetime import datetime
+
+
+def normalize_search_filename(query: str, extension: str = ".bib") -> str:
+ """Generate a timestamped, normalized filename from a search query.
+
+ Encodes positive keywords and active filters using hyphens.
+ Timestamp prefix ensures files sort chronologically.
+
+ Parameters
+ ----------
+ query : str
+ Raw search query string (colon-syntax or plain keywords).
+ extension : str
+ File extension to append (default: '.bib').
+
+ Returns
+ -------
+ str
+ Filename string, e.g.
+ ``20260218-083000-hippocampus-theta-2020-2024.bib``
+
+ Examples
+ --------
+ >>> normalize_search_filename("hippocampus sharp wave year:2020-2024") # doctest: +ELLIPSIS
+ '...-hippocampus-sharp-wave-2020-2024.bib'
+
+ >>> normalize_search_filename("neural network if:>5") # doctest: +ELLIPSIS
+ '...-neural-network-if5.bib'
+ """
+ # Import here to avoid circular dependency (storage <- pipelines <- storage)
+ from ..pipelines.SearchQueryParser import SearchQueryParser
+
+ parser = SearchQueryParser(query) if query else None
+
+ parts = []
+
+ if parser:
+ # Keywords (positive only)
+ for kw in parser.positive_keywords:
+ safe = re.sub(r"[^a-z0-9]+", "-", kw.lower()).strip("-")
+ if safe:
+ parts.append(safe)
+
+ # Year range
+ if parser.year_start and parser.year_end:
+ parts.append(f"{parser.year_start}-{parser.year_end}")
+ elif parser.year_start:
+ parts.append(f"from{parser.year_start}")
+ elif parser.year_end:
+ parts.append(f"to{parser.year_end}")
+
+ # Impact factor
+ if parser.min_impact_factor is not None:
+ val = (
+ int(parser.min_impact_factor)
+ if parser.min_impact_factor == int(parser.min_impact_factor)
+ else parser.min_impact_factor
+ )
+ parts.append(f"if{val}")
+
+ # Citations
+ if parser.min_citations is not None:
+ parts.append(f"c{parser.min_citations}")
+
+ # Open access
+ if parser.open_access:
+ parts.append("oa")
+
+ # Document type
+ if parser.document_type:
+ parts.append(parser.document_type)
+
+ stem = "-".join(parts) if parts else "search"
+ stem = re.sub(r"-+", "-", stem).strip("-")
+
+ timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
+ ext = extension if extension.startswith(".") else f".{extension}"
+ return f"{timestamp}-{stem}{ext}"
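
A usage sketch; the timestamps shown are examples, and the colon-syntax filters rely on the existing SearchQueryParser:

    from scitex.scholar.storage import normalize_search_filename

    normalize_search_filename("hippocampus theta year:2020-2024 if:>5 oa:true")
    # -> e.g. '20260218-083000-hippocampus-theta-2020-2024-if5-oa.bib'

    normalize_search_filename("ripples", extension="json")   # dot optional
    # -> e.g. '20260218-083000-ripples.json'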
diff --git a/src/scitex/types/_ArrayLike.py b/src/scitex/types/_ArrayLike.py
index 2f9619110..1636243cf 100755
--- a/src/scitex/types/_ArrayLike.py
+++ b/src/scitex/types/_ArrayLike.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
# Timestamp: "2025-05-01 09:21:23 (ywatanabe)"
# File: /home/ywatanabe/proj/scitex_repo/src/scitex/types/_ArrayLike.py
# ----------------------------------------
@@ -15,7 +14,14 @@
import numpy as _np
import pandas as _pd
-import xarray as _xr
+
+try:
+ import xarray as _xr
+
+ XARRAY_AVAILABLE = True
+except ImportError:
+ XARRAY_AVAILABLE = False
+ _xr = None
def _get_torch_tensor_type():
@@ -29,29 +35,38 @@ def _get_torch_tensor_type():
return type(None)
-ArrayLike = _Union[
- _List,
- _Tuple,
- _np.ndarray,
- _pd.Series,
- _pd.DataFrame,
- _xr.DataArray,
-]
+if XARRAY_AVAILABLE:
+ ArrayLike = _Union[
+ _List,
+ _Tuple,
+ _np.ndarray,
+ _pd.Series,
+ _pd.DataFrame,
+ _xr.DataArray,
+ ]
+else:
+ ArrayLike = _Union[
+ _List,
+ _Tuple,
+ _np.ndarray,
+ _pd.Series,
+ _pd.DataFrame,
+ ]
def is_array_like(obj) -> bool:
"""Check if object is array-like.
- Returns:
+ Returns
+ -------
bool: True if object is array-like, False otherwise.
"""
# First check against non-torch types
- is_standard_array = isinstance(
- obj,
- (_List, _Tuple, _np.ndarray, _pd.Series, _pd.DataFrame, _xr.DataArray),
- )
+ base_types = [_List, _Tuple, _np.ndarray, _pd.Series, _pd.DataFrame]
+ if XARRAY_AVAILABLE:
+ base_types.append(_xr.DataArray)
- if is_standard_array:
+ if isinstance(obj, tuple(base_types)):
return True
# Check torch tensor lazily to avoid circular imports
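
A quick sanity sketch of the optional-dependency fallback; importing ``XARRAY_AVAILABLE`` from the private module is for illustration only:

    import numpy as np
    from scitex.types._ArrayLike import XARRAY_AVAILABLE, is_array_like

    assert is_array_like(np.zeros(3))
    assert is_array_like((1, 2, 3))
    if XARRAY_AVAILABLE:          # True only when xarray imports cleanly
        import xarray as xr
        assert is_array_like(xr.DataArray(np.zeros(3)))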
diff --git a/src/scitex/writer/__init__.py b/src/scitex/writer/__init__.py
index f2e8a6284..1c247afdc 100755
--- a/src/scitex/writer/__init__.py
+++ b/src/scitex/writer/__init__.py
@@ -58,6 +58,7 @@
bib,
compile,
ensure_workspace,
+ export,
figures,
guidelines,
project,
@@ -97,6 +98,7 @@ def __getattr__(self, name):
bib = None
compile = None
ensure_workspace = None
+ export = None
figures = None
guidelines = None
project = None
@@ -131,6 +133,7 @@ def has_writer() -> bool:
# Modules
"bib",
"compile",
+ "export",
"figures",
"guidelines",
"project",
diff --git a/tests/scitex/scholar/integration/zotero/test_local_reader.py b/tests/scitex/scholar/integration/zotero/test_local_reader.py
new file mode 100755
index 000000000..d43a65482
--- /dev/null
+++ b/tests/scitex/scholar/integration/zotero/test_local_reader.py
@@ -0,0 +1,177 @@
+#!/usr/bin/env python3
+"""Tests for ZoteroLocalReader and export_for_zotero.
+
+Tests use the actual ~/Zotero/zotero.sqlite when present.
+All tests skip gracefully when no local Zotero database is found.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+
+_LINUX_DB = Path("~/Zotero/zotero.sqlite").expanduser()
+_WINDOWS_DB = Path("/mnt/c/Users/wyusu/Zotero/zotero.sqlite")
+
+pytestmark = pytest.mark.skipif(
+ not _LINUX_DB.exists(),
+ reason="No local Zotero database at ~/Zotero/zotero.sqlite",
+)
+
+
+# ── Fixtures ──────────────────────────────────────────────────────────────────
+
+
+@pytest.fixture(scope="module")
+def reader():
+ from scitex.scholar.integration.zotero import ZoteroLocalReader
+
+ return ZoteroLocalReader()
+
+
+@pytest.fixture(scope="module")
+def all_papers(reader):
+ return reader.read_all()
+
+
+# ── Path detection ────────────────────────────────────────────────────────────
+
+
+def test_detect_db_path_linux():
+ from scitex.scholar.integration.zotero import ZoteroLocalReader
+
+ r = ZoteroLocalReader()
+ assert r.db_path.exists()
+ assert r.db_path.suffix == ".sqlite"
+
+
+def test_explicit_db_path():
+ from scitex.scholar.integration.zotero import ZoteroLocalReader
+
+ r = ZoteroLocalReader(db_path=str(_LINUX_DB))
+ assert r.db_path == _LINUX_DB
+
+
+# ── read_all ─────────────────────────────────────────────────────────────────
+
+
+def test_read_all_returns_papers(all_papers):
+ from scitex.scholar.core.Papers import Papers
+
+ assert isinstance(all_papers, Papers)
+
+
+def test_read_all_count(all_papers):
+ # Linux DB has 49 items — at least a few must load
+ assert len(all_papers) >= 1
+
+
+def test_read_all_titles_not_empty(all_papers):
+ titles = [p.metadata.basic.title for p in all_papers if p.metadata.basic.title]
+ assert len(titles) >= 1
+
+
+def test_read_all_has_authors(all_papers):
+ papers_with_authors = [p for p in all_papers if p.metadata.basic.authors]
+ assert len(papers_with_authors) >= 1
+
+
+def test_read_all_with_limit(reader):
+ papers = reader.read_all(limit=3)
+ assert len(papers) <= 3
+
+
+# ── read_by_tags ──────────────────────────────────────────────────────────────
+
+
+def test_read_by_tags_returns_subset(reader, all_papers):
+ # "Epilepsy" tag is known to exist in the Linux DB
+ epilepsy_papers = reader.read_by_tags(["Epilepsy"])
+ assert len(epilepsy_papers) >= 1
+ assert len(epilepsy_papers) <= len(all_papers)
+
+
+def test_read_by_tags_any(reader):
+ # OR logic: items with either tag
+ papers = reader.read_by_tags(["Epilepsy", "EEG"], match_all=False)
+ assert len(papers) >= 1
+
+
+def test_read_by_tags_all(reader):
+ # AND logic: items with BOTH tags (may be 0 if no overlap)
+ papers = reader.read_by_tags(["Epilepsy", "EEG"], match_all=True)
+ assert isinstance(papers.papers, list) # result is valid, even if empty
+
+
+def test_read_by_tags_nonexistent(reader):
+ papers = reader.read_by_tags(["NonExistentTag_XYZ_999"])
+ assert len(papers) == 0
+
+
+# ── read_by_collection ────────────────────────────────────────────────────────
+
+
+def test_read_by_collection_nonexistent(reader):
+ # Linux DB has 0 collections; should return empty Papers, not raise
+ papers = reader.read_by_collection("NonExistentCollection")
+ assert len(papers) == 0
+
+
+# ── export_for_zotero ─────────────────────────────────────────────────────────
+
+
+def test_export_for_zotero_bibtex(all_papers, tmp_path):
+ from scitex.scholar.integration.zotero import export_for_zotero
+
+ out = tmp_path / "export.bib"
+ result = export_for_zotero(all_papers, out, fmt="bibtex")
+
+ assert result == out
+ assert out.exists()
+ content = out.read_text()
+ assert "@" in content # at least one BibTeX entry
+
+
+def test_export_for_zotero_ris(all_papers, tmp_path):
+ from scitex.scholar.integration.zotero import export_for_zotero
+
+ out = tmp_path / "export.ris"
+ result = export_for_zotero(all_papers, out, fmt="ris")
+
+ assert result == out
+ assert out.exists()
+ content = out.read_text()
+ assert "TY -" in content # at least one RIS entry
+
+
+def test_export_roundtrip_titles(all_papers, tmp_path):
+ """Titles present in Papers appear in BibTeX output."""
+ from scitex.scholar.integration.zotero import export_for_zotero
+
+ out = tmp_path / "roundtrip.bib"
+ export_for_zotero(all_papers, out, fmt="bibtex")
+
+ content = out.read_text()
+ titles = [p.metadata.basic.title for p in all_papers if p.metadata.basic.title]
+ # At least one title should appear (partially) in the output
+ assert any(t[:20] in content for t in titles if len(t) >= 20)
+
+
+# ── Windows WSL path ──────────────────────────────────────────────────────────
+
+
+@pytest.mark.skipif(
+ not _WINDOWS_DB.exists(),
+ reason="Windows Zotero DB not accessible at /mnt/c/Users/wyusu/Zotero/",
+)
+def test_windows_db_read():
+ from scitex.scholar.integration.zotero import ZoteroLocalReader
+
+ r = ZoteroLocalReader(db_path=_WINDOWS_DB)
+ papers = r.read_all(limit=10)
+ assert len(papers) >= 1
+ assert len(papers) <= 10
+
+
+# EOF
diff --git a/tests/scitex/scholar/storage/test__search_filename_and_symlink.py b/tests/scitex/scholar/storage/test__search_filename_and_symlink.py
new file mode 100755
index 000000000..39b886580
--- /dev/null
+++ b/tests/scitex/scholar/storage/test__search_filename_and_symlink.py
@@ -0,0 +1,809 @@
+#!/usr/bin/env python3
+# Timestamp: "2026-02-18"
+# File: tests/scitex/scholar/storage/test__search_filename_and_symlink.py
+# ----------------------------------------
+
+"""
+Comprehensive tests for normalize_search_filename and _create_project_local_symlink.
+
+Feature 1: normalize_search_filename
+- Generates timestamped filenames from search queries
+- Format: YYYYMMDD-HHMMSS-{normalized-query}.{ext}
+- Uses SearchQueryParser to extract filters
+
+Feature 2: _create_project_local_symlink
+- Creates symlinks at {project_dir}/scitex/scholar/library/{project}/{readable_name}
+- Symlink target is absolute path to master_storage_path
+- Removes stale symlinks pointing to same master entry with different names
+"""
+
+import importlib.util
+import json
+import os
+import sys
+from datetime import datetime
+from pathlib import Path
+from unittest.mock import MagicMock, Mock, patch
+
+import pytest
+
+# ============================================================================
+# Module Loading Helpers
+# ============================================================================
+
+
+def load_module(name, path):
+ """Load a module from file path using importlib.util."""
+ spec = importlib.util.spec_from_file_location(name, path)
+ if spec is None or spec.loader is None:
+ raise ImportError(f"Cannot load spec for {name} from {path}")
+ mod = importlib.util.module_from_spec(spec)
+ sys.modules[name] = mod
+ spec.loader.exec_module(mod)
+ return mod
+
+
+PROJECT_ROOT = Path(
+ __file__
+).parent.parent.parent.parent.parent # tests/scitex/scholar/storage/ -> project root
+
+
+@pytest.fixture(scope="session")
+def search_query_parser_module():
+ """Load SearchQueryParser module once per session."""
+ module_path = PROJECT_ROOT / "src/scitex/scholar/pipelines/SearchQueryParser.py"
+ return load_module("scitex.scholar.pipelines.SearchQueryParser", str(module_path))
+
+
+@pytest.fixture(scope="session")
+def search_filename_module():
+ """Load _search_filename module once per session."""
+ module_path = PROJECT_ROOT / "src/scitex/scholar/storage/_search_filename.py"
+ return load_module("scitex.scholar.storage._search_filename", str(module_path))
+
+
+@pytest.fixture(scope="session")
+def symlink_handlers_module():
+ """Load _symlink_handlers module once per session (avoids full scitex import chain)."""
+ module_path = (
+ PROJECT_ROOT / "src/scitex/scholar/storage/_mixins/_symlink_handlers.py"
+ )
+ return load_module(
+ "scitex.scholar.storage._mixins._symlink_handlers", str(module_path)
+ )
+
+
+@pytest.fixture(scope="session")
+def SymlinkHandlersMixin(symlink_handlers_module):
+ """Get the SymlinkHandlersMixin class."""
+ return symlink_handlers_module.SymlinkHandlersMixin
+
+
+@pytest.fixture
+def normalize_search_filename(search_filename_module):
+ """Get the normalize_search_filename function."""
+ return search_filename_module.normalize_search_filename
+
+
+@pytest.fixture
+def SearchQueryParser(search_query_parser_module):
+ """Get the SearchQueryParser class."""
+ return search_query_parser_module.SearchQueryParser
+
+
+# ============================================================================
+# Feature 1: normalize_search_filename Tests
+# ============================================================================
+
+
+class TestNormalizeSearchFilenameBasics:
+ """Test basic functionality of normalize_search_filename."""
+
+ def test_empty_query_returns_search_bib(self, normalize_search_filename):
+ """Empty query should return filename with 'search' as stem."""
+ result = normalize_search_filename("")
+ # Format: YYYYMMDD-HHMMSS-search.bib
+ assert result.endswith("-search.bib")
+ # Check timestamp prefix (YYYYMMDD-HHMMSS)
+ parts = result.split("-")
+ assert len(parts) >= 3
+ assert len(parts[0]) == 8 # YYYYMMDD
+ assert len(parts[1]) == 6 # HHMMSS
+
+ def test_simple_keywords_with_hyphens(self, normalize_search_filename):
+ """Simple keywords should be joined with hyphens."""
+ result = normalize_search_filename("hippocampus theta")
+ assert "hippocampus-theta" in result
+ assert ".bib" in result
+ # Should NOT have underscores
+ assert "_" not in result.split("-search")[0]
+
+ def test_keywords_converted_to_lowercase(self, normalize_search_filename):
+ """Keywords should be converted to lowercase."""
+ result = normalize_search_filename("HIPPOCAMPUS Sharp WAVE")
+ assert "hippocampus-sharp-wave" in result
+ # Verify no uppercase letters in the normalized part
+ stem = (
+ result.split("-search")[0] if "-search" in result else result.split(".")[0]
+ )
+ normalized_part = "-".join(stem.split("-")[2:]) # Skip timestamp
+ assert normalized_part.islower()
+
+ def test_special_characters_removed(self, normalize_search_filename):
+ """Special characters should be removed, words joined with hyphens."""
+ result = normalize_search_filename("neural@network signal-processing")
+ # special chars removed, hyphens preserved for word separation
+ assert "-" in result
+ assert "@" not in result
+
+ def test_custom_extension(self, normalize_search_filename):
+ """Should support custom file extensions."""
+ result_json = normalize_search_filename("test query", extension=".json")
+ assert result_json.endswith(".json")
+
+ result_csv = normalize_search_filename("test query", extension="csv")
+ assert result_csv.endswith(".csv")
+
+ result_txt = normalize_search_filename("test query", extension="txt")
+ assert result_txt.endswith(".txt")
+
+ def test_extension_format_normalization(self, normalize_search_filename):
+ """Extension should work with or without leading dot."""
+ result_with_dot = normalize_search_filename("query", extension=".bib")
+ result_without_dot = normalize_search_filename("query", extension="bib")
+
+ # Both should end with .bib
+ assert result_with_dot.endswith(".bib")
+ assert result_without_dot.endswith(".bib")
+
+
+class TestNormalizeSearchFilenameTimestamp:
+ """Test timestamp generation in normalize_search_filename."""
+
+ def test_timestamp_format_yyyymmdd_hhmmss(self, normalize_search_filename):
+ """Timestamp should be YYYYMMDD-HHMMSS format."""
+ result = normalize_search_filename("test")
+ # Extract timestamp (first two hyphen-separated parts)
+ parts = result.split("-", 2)
+ assert len(parts) >= 2
+
+ date_part = parts[0]
+ time_part = parts[1]
+
+ # Check date format (YYYYMMDD)
+ assert len(date_part) == 8
+ assert date_part.isdigit()
+ year = int(date_part[:4])
+ month = int(date_part[4:6])
+ day = int(date_part[6:8])
+ assert 2000 <= year <= 2100
+ assert 1 <= month <= 12
+ assert 1 <= day <= 31
+
+ # Check time format (HHMMSS)
+ assert len(time_part) == 6
+ assert time_part.isdigit()
+ hour = int(time_part[:2])
+ minute = int(time_part[2:4])
+ second = int(time_part[4:6])
+ assert 0 <= hour <= 23
+ assert 0 <= minute <= 59
+ assert 0 <= second <= 59
+
+ def test_timestamp_is_reasonable(self, normalize_search_filename):
+ """Timestamp should be close to current time."""
+ before = datetime.now()
+ result = normalize_search_filename("query")
+ after = datetime.now()
+
+ # Extract timestamp
+ timestamp_str = result.split("-", 2)[0] + "-" + result.split("-", 2)[1]
+ timestamp = datetime.strptime(timestamp_str, "%Y%m%d-%H%M%S")
+
+ # Timestamp should be within a reasonable range (strip microseconds; strptime gives second precision)
+ assert before.replace(microsecond=0) <= timestamp <= after
+
+
+class TestNormalizeSearchFilenameFilters:
+ """Test filter encoding in normalize_search_filename."""
+
+ def test_year_range_encoding(self, normalize_search_filename):
+ """Year range should be encoded as YYYY-YYYY."""
+ result = normalize_search_filename("query year:2020-2024")
+ assert "2020-2024" in result
+
+ def test_year_start_only_encoding(self, normalize_search_filename):
+ """Year start only should be encoded as from{YYYY}."""
+ result = normalize_search_filename("query year:>2020")
+ assert "from2020" in result
+
+ def test_year_end_only_encoding(self, normalize_search_filename):
+ """Year end only should be encoded as to{YYYY}."""
+ result = normalize_search_filename("query year:<2024")
+ assert "to2024" in result
+
+ def test_impact_factor_encoding(self, normalize_search_filename):
+ """Impact factor should be encoded as if{value}."""
+ result = normalize_search_filename("query if:>5")
+ assert "if5" in result
+
+ result_decimal = normalize_search_filename("query if:>5.5")
+ assert "if5.5" in result_decimal
+
+ def test_citation_count_encoding(self, normalize_search_filename):
+ """Citation count should be encoded as c{count}."""
+ result = normalize_search_filename("query citations:>100")
+ assert "c100" in result
+
+ result_alt = normalize_search_filename("query citation:>50")
+ assert "c50" in result_alt
+
+ def test_open_access_encoding(self, normalize_search_filename):
+ """Open access should be encoded as 'oa'."""
+ result = normalize_search_filename("query open_access:true")
+ assert "oa" in result
+
+ result_alt = normalize_search_filename("query oa:yes")
+ assert "oa" in result_alt
+
+ def test_document_type_encoding(self, normalize_search_filename):
+ """Document type should be appended to filename."""
+ result = normalize_search_filename("query type:article")
+ assert "article" in result
+
+ result_review = normalize_search_filename("query type:review")
+ assert "review" in result_review
+
+ def test_complex_query_with_multiple_filters(self, normalize_search_filename):
+ """Complex query should encode all filters correctly."""
+ result = normalize_search_filename(
+ "hippocampus neural network year:2020-2024 if:>5 citations:>100 oa:true type:article"
+ )
+ # Should contain all key elements
+ assert "hippocampus" in result
+ assert "neural-network" in result
+ assert "2020-2024" in result
+ assert "if5" in result
+ assert "c100" in result
+ assert "oa" in result
+ assert "article" in result
+
+
+class TestNormalizeSearchFilenameMixins:
+ """Test mixin behavior and edge cases."""
+
+ def test_multiple_spaces_collapsed(self, normalize_search_filename):
+ """Multiple spaces should be handled gracefully."""
+ result1 = normalize_search_filename("keyword1 keyword2")
+ result2 = normalize_search_filename("keyword1 keyword2")
+ # Both should produce same normalized part
+ assert (
+ result1.split(".")[0].split("-")[-2:]
+ == result2.split(".")[0].split("-")[-2:]
+ )
+
+ def test_leading_trailing_whitespace_ignored(self, normalize_search_filename):
+ """Leading and trailing whitespace should be ignored."""
+ result1 = normalize_search_filename(" query ")
+ result2 = normalize_search_filename("query")
+ # Normalized parts should match
+ assert (
+ result1.split(".")[0].split("-")[-1] == result2.split(".")[0].split("-")[-1]
+ )
+
+ def test_quoted_phrases_treated_as_single_keyword(self, normalize_search_filename):
+ """Quoted phrases should be treated as single keywords with hyphens."""
+ result = normalize_search_filename('"sharp wave" ripple')
+ # Phrase should be present (possibly hyphenated)
+ assert "sharp" in result
+ assert "wave" in result
+ assert "ripple" in result
+
+ def test_negative_keywords_excluded(self, normalize_search_filename):
+ """Negative keywords (prefixed with -) should not appear in filename."""
+ result = normalize_search_filename("hippocampus -seizure -epilepsy")
+ assert "hippocampus" in result
+ # Negative keywords should not be in the filename
+ assert "seizure" not in result
+ assert "epilepsy" not in result
+
+ def test_hyphen_collapsing(self, normalize_search_filename):
+ """Multiple consecutive hyphens should be collapsed."""
+ # This tests the internal regex cleanup
+ result = normalize_search_filename("query")
+ # Should have only single hyphens between components
+ timestamp_sep = result.count("---")
+ # Should not have triple hyphens in the normalized part
+ assert timestamp_sep == 0
+
+ def test_leading_trailing_hyphens_stripped(self, normalize_search_filename):
+ """Leading and trailing hyphens in normalized part should be stripped."""
+ result = normalize_search_filename("test")
+ # Split on dots to get filename without extension
+ filename = result.split(".")[0]
+ # Extract normalized part (skip YYYYMMDD-HHMMSS-)
+ normalized = "-".join(filename.split("-")[2:])
+ # Should not start or end with hyphen
+ assert not normalized.startswith("-")
+ assert not normalized.endswith("-")
+
+
+# ============================================================================
+# Feature 2: _create_project_local_symlink Tests
+# ============================================================================
+
+
+class TestCreateProjectLocalSymlinkBasics:
+ """Test basic symlink creation functionality."""
+
+ @pytest.fixture
+ def mixin_instance(self, tmp_path, SymlinkHandlersMixin):
+ """Create a minimal instance with the mixin."""
+
+ class FakeLibraryManager(SymlinkHandlersMixin):
+ def __init__(self, project=None, project_dir=None):
+ self.project = project
+ self.project_dir = project_dir
+
+ return FakeLibraryManager()
+
+ def test_returns_none_when_project_dir_not_set(self, mixin_instance, tmp_path):
+ """Should return None if project_dir is not set."""
+ mixin_instance.project = "test_project"
+ mixin_instance.project_dir = None
+
+ master_path = tmp_path / "master" / "ABC123"
+ master_path.mkdir(parents=True)
+
+ result = mixin_instance._create_project_local_symlink(
+ master_path, "readable_name"
+ )
+ assert result is None
+
+ def test_returns_none_for_master_project(self, mixin_instance, tmp_path):
+ """Should return None when project is 'master'."""
+ mixin_instance.project = "master"
+ mixin_instance.project_dir = tmp_path / "project"
+ mixin_instance.project_dir.mkdir(parents=True)
+
+ master_path = tmp_path / "master" / "ABC123"
+ master_path.mkdir(parents=True)
+
+ result = mixin_instance._create_project_local_symlink(
+ master_path, "readable_name"
+ )
+ assert result is None
+
+ def test_returns_none_for_master_uppercase(self, mixin_instance, tmp_path):
+ """Should return None when project is 'MASTER' (uppercase)."""
+ mixin_instance.project = "MASTER"
+ mixin_instance.project_dir = tmp_path / "project"
+ mixin_instance.project_dir.mkdir(parents=True)
+
+ master_path = tmp_path / "master" / "ABC123"
+ master_path.mkdir(parents=True)
+
+ result = mixin_instance._create_project_local_symlink(
+ master_path, "readable_name"
+ )
+ assert result is None
+
+ def test_creates_symlink_at_correct_path(self, mixin_instance, tmp_path):
+ """Should create symlink at {project_dir}/scitex/scholar/library/{project}/{readable_name}."""
+ mixin_instance.project = "test_project"
+ mixin_instance.project_dir = tmp_path / "project"
+
+ master_path = tmp_path / "master" / "ABC123"
+ master_path.mkdir(parents=True)
+
+ readable_name = "PDF-01_CC-000100_IF-005_2024_Smith_Nature"
+ result = mixin_instance._create_project_local_symlink(
+ master_path, readable_name
+ )
+
+ # Check that symlink was created at expected location
+ expected_path = (
+ tmp_path
+ / "project"
+ / "scitex"
+ / "scholar"
+ / "library"
+ / "test_project"
+ / readable_name
+ )
+ assert result == expected_path
+ assert expected_path.exists()
+ assert expected_path.is_symlink()
+
+ def test_symlink_target_is_absolute(self, mixin_instance, tmp_path):
+ """Symlink target should be absolute path to master storage."""
+ mixin_instance.project = "test_project"
+ mixin_instance.project_dir = tmp_path / "project"
+
+ master_path = tmp_path / "master" / "ABC123"
+ master_path.mkdir(parents=True)
+
+ readable_name = "Paper_Name"
+ symlink_path = mixin_instance._create_project_local_symlink(
+ master_path, readable_name
+ )
+
+ # Resolve symlink target
+ target = symlink_path.resolve()
+
+ # Target should be the absolute path to master storage
+ assert target == master_path.resolve()
+ assert target.is_absolute()
+
+ def test_creates_parent_directories(self, mixin_instance, tmp_path):
+ """Should create parent directories if they don't exist."""
+ mixin_instance.project = "new_project"
+ mixin_instance.project_dir = tmp_path / "project"
+
+ master_path = tmp_path / "master" / "DEF456"
+ master_path.mkdir(parents=True)
+
+ result = mixin_instance._create_project_local_symlink(master_path, "paper_name")
+
+ # Check that parent directory structure was created
+ parent_dir = (
+ tmp_path / "project" / "scitex" / "scholar" / "library" / "new_project"
+ )
+ assert parent_dir.exists()
+ assert parent_dir.is_dir()
+
+
+class TestCreateProjectLocalSymlinkStaleSymlinks:
+ """Test stale symlink removal functionality."""
+
+ @pytest.fixture
+ def mixin_instance(self, tmp_path, SymlinkHandlersMixin):
+ """Create a minimal instance with the mixin."""
+
+ class FakeLibraryManager(SymlinkHandlersMixin):
+ def __init__(self, project=None, project_dir=None):
+ self.project = project
+ self.project_dir = project_dir
+
+ return FakeLibraryManager()
+
+ def test_removes_stale_symlink_same_master_different_name(
+ self, mixin_instance, tmp_path
+ ):
+ """Should remove stale symlink pointing to same master entry with different name."""
+ mixin_instance.project = "test_project"
+ mixin_instance.project_dir = tmp_path / "project"
+
+ master_path = tmp_path / "master" / "ABC123"
+ master_path.mkdir(parents=True)
+
+ # Create directory for symlinks
+ symlink_dir = (
+ tmp_path / "project" / "scitex" / "scholar" / "library" / "test_project"
+ )
+ symlink_dir.mkdir(parents=True)
+
+ # Create a stale symlink with different name pointing to same master
+ old_symlink = symlink_dir / "Old_Paper_Name"
+ old_symlink.symlink_to(master_path.resolve())
+
+ assert old_symlink.exists()
+
+ # Create new symlink with different name, same master target
+ new_name = "New_Paper_Name"
+ result = mixin_instance._create_project_local_symlink(master_path, new_name)
+
+ # Old symlink should be removed
+ assert not old_symlink.exists()
+
+ # New symlink should exist
+ assert result.exists()
+ assert result.name == new_name
+
+ def test_preserves_symlink_with_same_name(self, mixin_instance, tmp_path):
+ """Should not remove symlink if name matches."""
+ mixin_instance.project = "test_project"
+ mixin_instance.project_dir = tmp_path / "project"
+
+ master_path = tmp_path / "master" / "ABC123"
+ master_path.mkdir(parents=True)
+
+ symlink_dir = (
+ tmp_path / "project" / "scitex" / "scholar" / "library" / "test_project"
+ )
+ symlink_dir.mkdir(parents=True)
+
+ readable_name = "PDF-01_CC-000100_IF-005_2024_Smith_Nature"
+ symlink_path = symlink_dir / readable_name
+ symlink_path.symlink_to(master_path.resolve())
+
+ original_inode = symlink_path.lstat().st_ino
+
+ # Call method with same name
+ result = mixin_instance._create_project_local_symlink(
+ master_path, readable_name
+ )
+
+ # Symlink should still exist (not removed)
+ assert result.exists()
+ # Unchanged inode confirms the existing symlink was not recreated
+ assert result.lstat().st_ino == original_inode
+ # Should point to same target
+ assert result.resolve() == master_path.resolve()
+
+ def test_ignores_non_symlink_files(self, mixin_instance, tmp_path):
+ """Should ignore non-symlink files in directory."""
+ mixin_instance.project = "test_project"
+ mixin_instance.project_dir = tmp_path / "project"
+
+ master_path = tmp_path / "master" / "ABC123"
+ master_path.mkdir(parents=True)
+
+ symlink_dir = (
+ tmp_path / "project" / "scitex" / "scholar" / "library" / "test_project"
+ )
+ symlink_dir.mkdir(parents=True)
+
+ # Create a regular file (not symlink)
+ regular_file = symlink_dir / "regular_file.txt"
+ regular_file.write_text("This is a regular file")
+
+ # Create symlink
+ result = mixin_instance._create_project_local_symlink(
+ master_path, "new_symlink"
+ )
+
+ # Regular file should still exist
+ assert regular_file.exists()
+ assert not regular_file.is_symlink()
+
+ # New symlink should be created
+ assert result.exists()
+
+ def test_handles_broken_symlinks_gracefully(self, mixin_instance, tmp_path):
+ """Should handle broken symlinks without crashing."""
+ mixin_instance.project = "test_project"
+ mixin_instance.project_dir = tmp_path / "project"
+
+ master_path = tmp_path / "master" / "ABC123"
+ master_path.mkdir(parents=True)
+
+ symlink_dir = (
+ tmp_path / "project" / "scitex" / "scholar" / "library" / "test_project"
+ )
+ symlink_dir.mkdir(parents=True)
+
+ # Create a broken symlink (target doesn't exist)
+ broken_symlink = symlink_dir / "broken_link"
+ broken_symlink.symlink_to("/nonexistent/path")
+
+ # This should not crash
+ result = mixin_instance._create_project_local_symlink(
+ master_path, "new_symlink"
+ )
+
+ assert result.exists()
+ # Broken symlink should remain (since target doesn't match)
+ assert broken_symlink.is_symlink()
+
+ def test_removes_only_matching_master_id(self, mixin_instance, tmp_path):
+ """Should only remove symlinks pointing to the same master ID."""
+ mixin_instance.project = "test_project"
+ mixin_instance.project_dir = tmp_path / "project"
+
+ master_path_1 = tmp_path / "master" / "ABC123"
+ master_path_2 = tmp_path / "master" / "DEF456"
+ master_path_1.mkdir(parents=True)
+ master_path_2.mkdir(parents=True)
+
+ symlink_dir = (
+ tmp_path / "project" / "scitex" / "scholar" / "library" / "test_project"
+ )
+ symlink_dir.mkdir(parents=True)
+
+ # Create symlinks to different masters
+ old_symlink_1 = symlink_dir / "Old_Name_1"
+ old_symlink_1.symlink_to(master_path_1.resolve())
+
+ other_symlink = symlink_dir / "Other_Master"
+ other_symlink.symlink_to(master_path_2.resolve())
+
+ # Create new symlink for master 1 with different name
+ result = mixin_instance._create_project_local_symlink(
+ master_path_1, "New_Name_1"
+ )
+
+ # Old symlink for master 1 should be removed
+ assert not old_symlink_1.exists()
+
+ # Symlink for other master should remain
+ assert other_symlink.exists()
+ assert other_symlink.resolve() == master_path_2.resolve()
+
+ # New symlink should exist
+ assert result.exists()
+ assert result.resolve() == master_path_1.resolve()
+
+
+class TestCreateProjectLocalSymlinkReturnValue:
+ """Test return values of _create_project_local_symlink."""
+
+ @pytest.fixture
+ def mixin_instance(self, tmp_path, SymlinkHandlersMixin):
+ """Create a minimal instance with the mixin."""
+
+ class FakeLibraryManager(SymlinkHandlersMixin):
+ def __init__(self, project=None, project_dir=None):
+ self.project = project
+ self.project_dir = project_dir
+
+ return FakeLibraryManager()
+
+ def test_returns_path_object_on_success(self, mixin_instance, tmp_path):
+ """Should return Path object when symlink is created successfully."""
+ mixin_instance.project = "test_project"
+ mixin_instance.project_dir = tmp_path / "project"
+
+ master_path = tmp_path / "master" / "ABC123"
+ master_path.mkdir(parents=True)
+
+ result = mixin_instance._create_project_local_symlink(master_path, "paper")
+
+ assert isinstance(result, Path)
+ assert result.exists()
+
+ def test_return_path_is_correct_path(self, mixin_instance, tmp_path):
+ """Returned path should match the created symlink path."""
+ mixin_instance.project = "test_project"
+ mixin_instance.project_dir = tmp_path / "project"
+
+ master_path = tmp_path / "master" / "ABC123"
+ master_path.mkdir(parents=True)
+ readable_name = "PDF-01_CC_IF"
+
+ result = mixin_instance._create_project_local_symlink(
+ master_path, readable_name
+ )
+
+ expected = (
+ tmp_path
+ / "project"
+ / "scitex"
+ / "scholar"
+ / "library"
+ / "test_project"
+ / readable_name
+ )
+ assert result == expected
+
+ def test_returns_none_on_missing_project_dir(self, mixin_instance, tmp_path):
+ """Should return None if project_dir is None."""
+ mixin_instance.project = "test_project"
+ mixin_instance.project_dir = None
+
+ master_path = tmp_path / "master" / "ABC123"
+ master_path.mkdir(parents=True)
+
+ result = mixin_instance._create_project_local_symlink(master_path, "paper")
+ assert result is None
+
+ def test_returns_none_on_master_project(self, mixin_instance, tmp_path):
+ """Should return None if project is 'master'."""
+ mixin_instance.project = "master"
+ mixin_instance.project_dir = tmp_path / "project"
+ mixin_instance.project_dir.mkdir(parents=True)
+
+ master_path = tmp_path / "master" / "ABC123"
+ master_path.mkdir(parents=True)
+
+ result = mixin_instance._create_project_local_symlink(master_path, "paper")
+ assert result is None
+
+
+class TestCreateProjectLocalSymlinkEdgeCases:
+ """Test edge cases and special scenarios."""
+
+ @pytest.fixture
+ def mixin_instance(self, tmp_path, SymlinkHandlersMixin):
+ """Create a minimal instance with the mixin."""
+
+ class FakeLibraryManager(SymlinkHandlersMixin):
+ def __init__(self, project=None, project_dir=None):
+ self.project = project
+ self.project_dir = project_dir
+
+ return FakeLibraryManager()
+
+ def test_handles_special_characters_in_readable_name(
+ self, mixin_instance, tmp_path
+ ):
+ """Should handle special characters in readable_name."""
+ mixin_instance.project = "test_project"
+ mixin_instance.project_dir = tmp_path / "project"
+
+ master_path = tmp_path / "master" / "ABC123"
+ master_path.mkdir(parents=True)
+
+ readable_name = "PDF-01_CC-000100_IF-005_2024_Smith-Jones_Nature-Science"
+ result = mixin_instance._create_project_local_symlink(
+ master_path, readable_name
+ )
+
+ assert result is not None
+ assert result.exists()
+ assert result.name == readable_name
+
+ def test_handles_long_readable_name(self, mixin_instance, tmp_path):
+ """Should handle long readable names."""
+ mixin_instance.project = "test_project"
+ mixin_instance.project_dir = tmp_path / "project"
+
+ master_path = tmp_path / "master" / "ABC123"
+ master_path.mkdir(parents=True)
+
+ readable_name = "PDF-01_CC-999999_IF-999_2024_VeryLongAuthorName_VeryLongJournalNameThatExceedsNormalLength"
+ result = mixin_instance._create_project_local_symlink(
+ master_path, readable_name
+ )
+
+ assert result is not None
+ assert result.exists()
+ assert result.name == readable_name
+
+ def test_handles_paths_with_spaces(self, mixin_instance, tmp_path):
+ """Should handle paths with spaces."""
+ mixin_instance.project = "test project"
+ project_dir = tmp_path / "my project"
+ mixin_instance.project_dir = project_dir
+
+ master_path = tmp_path / "master" / "ABC123"
+ master_path.mkdir(parents=True)
+
+ result = mixin_instance._create_project_local_symlink(master_path, "paper name")
+
+ assert result is not None
+ assert result.exists()
+ # Path should contain spaces correctly
+ assert "test project" in str(result)
+ assert "paper name" in str(result)
+
+ def test_handles_nested_master_paths(self, mixin_instance, tmp_path):
+ """Should handle deeply nested master storage paths."""
+ mixin_instance.project = "test_project"
+ mixin_instance.project_dir = tmp_path / "project"
+
+ # Create nested path
+ master_path = tmp_path / "archive" / "deep" / "nested" / "master" / "ABC123"
+ master_path.mkdir(parents=True)
+
+ result = mixin_instance._create_project_local_symlink(master_path, "paper")
+
+ assert result is not None
+ assert result.exists()
+ # Symlink target should resolve to the correct master path
+ assert result.resolve() == master_path.resolve()
+
+ def test_idempotency_same_call_twice(self, mixin_instance, tmp_path):
+ """Calling with same arguments twice should be idempotent."""
+ mixin_instance.project = "test_project"
+ mixin_instance.project_dir = tmp_path / "project"
+
+ master_path = tmp_path / "master" / "ABC123"
+ master_path.mkdir(parents=True)
+ readable_name = "Paper_Name"
+
+ result1 = mixin_instance._create_project_local_symlink(
+ master_path, readable_name
+ )
+ result2 = mixin_instance._create_project_local_symlink(
+ master_path, readable_name
+ )
+
+ # Both calls should return same path
+ assert result1 == result2
+ # Path should exist after both calls
+ assert result1.exists()
+ assert result2.exists()
+ # Should point to same target
+ assert result1.resolve() == result2.resolve()
+
+
+# EOF
diff --git a/tests/scitex/stats/descriptive/test__describe.py b/tests/scitex/stats/descriptive/test__describe.py
old mode 100644
new mode 100755
index 26ed40906..9696ef24a
--- a/tests/scitex/stats/descriptive/test__describe.py
+++ b/tests/scitex/stats/descriptive/test__describe.py
@@ -24,8 +24,9 @@ def test_basic_describe(self):
7,
), f"Expected shape (10, 7), got {described.shape}"
assert len(names) == 7, "Should return 7 stat names"
- assert "nanmean" in names
- assert "nanstd" in names
+ assert "mean" in names
+ assert "std" in names
+ assert "median" in names
def test_with_nans(self):
"""Test with NaN values."""
@@ -63,9 +64,9 @@ def test_all_funcs(self):
described, names = describe(x, dim=-1, funcs="all")
assert len(names) > 7, "Should return all available functions"
- assert "nanmax" in names
- assert "nanmin" in names
- assert "nancount" in names
+ assert "max" in names
+ assert "min" in names
+ assert "count" in names
def test_custom_funcs(self):
"""Test with custom function list."""