diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 9a2693ee2..f269a9854 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -138,6 +138,7 @@ jobs: name: Publish to PyPI needs: [validate, test-install, github-release] runs-on: ubuntu-latest + environment: pypi permissions: id-token: write steps: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 74158b6e9..cfff5f2fd 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,5 +1,6 @@ # Timestamp: "2026-02-15 14:00:26 (ywatanabe)" # File: ./.pre-commit-config.yaml +repos: [] # repos: # - repo: local # hooks: diff --git a/scripts/assets/workflow.py b/scripts/assets/workflow.py index 16d1eb649..e76f47029 100755 --- a/scripts/assets/workflow.py +++ b/scripts/assets/workflow.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Timestamp: "2026-02-16 09:09:47 (ywatanabe)" +# Timestamp: "2026-02-16 12:40:23 (ywatanabe)" # File: /home/ywatanabe/proj/scitex-python/scripts/assets/workflow.py @@ -34,16 +34,16 @@ def _add_analysis_boxes(d, C, CT): "Core Modules", subtitle="Infrastructure of All Python Scripts in SciTeX", content=[ + "@scitex.session", + "scitex.plt", + "(figrecipe)", + "scitex.stats", "scitex.io", "scitex.config", "scitex.logger", - "@scitex.session", - "scitex.template", "scitex.repro", "scitex.rng", - "scitex.plt", - "(figrecipe)", - "scitex.stats", + "scitex.template", ], fill_color=CT["blue"], border_color=C["blue"], diff --git a/scripts/assets/workflow_out/workflow.json b/scripts/assets/workflow_out/workflow.json index efb070680..cbff8e5c1 100644 --- a/scripts/assets/workflow_out/workflow.json +++ b/scripts/assets/workflow_out/workflow.json @@ -4,7 +4,7 @@ "meta": { "title": "", "description": "", - "exported_at": "2026-02-16T10:44:22.100200" + "exported_at": "2026-02-16T12:40:50.253456" }, "figure": { "size_px": [ @@ -507,14 +507,14 @@ "id": "ax_00_text_08", "axes_id": "ax_00", "element_type": "text", - "label": "\u00b7 scitex.io", - "text": "\u00b7 scitex.io", + "label": "\u00b7 @scitex.session", + "text": "\u00b7 @scitex.session", "geometry_px": { "coord_space": "axes", "bbox": { - "x0": 3119, + "x0": 3099, "y0": 517, - "x1": 3166, + "x1": 3186, "y1": 527 }, "anchor": { @@ -548,14 +548,14 @@ "id": "ax_00_text_09", "axes_id": "ax_00", "element_type": "text", - "label": "\u00b7 scitex.config", - "text": "\u00b7 scitex.config", + "label": "\u00b7 scitex.plt", + "text": "\u00b7 scitex.plt", "geometry_px": { "coord_space": "axes", "bbox": { - "x0": 3108, + "x0": 3117, "y0": 534, - "x1": 3176, + "x1": 3167, "y1": 544 }, "anchor": { @@ -589,14 +589,14 @@ "id": "ax_00_text_10", "axes_id": "ax_00", "element_type": "text", - "label": "\u00b7 scitex.logger", - "text": "\u00b7 scitex.logger", + "label": "\u00b7 (figrecipe)", + "text": "\u00b7 (figrecipe)", "geometry_px": { "coord_space": "axes", "bbox": { - "x0": 3108, + "x0": 3114, "y0": 551, - "x1": 3177, + "x1": 3170, "y1": 561 }, "anchor": { @@ -630,14 +630,14 @@ "id": "ax_00_text_11", "axes_id": "ax_00", "element_type": "text", - "label": "\u00b7 @scitex.session", - "text": "\u00b7 @scitex.session", + "label": "\u00b7 scitex.stats", + "text": "\u00b7 scitex.stats", "geometry_px": { "coord_space": "axes", "bbox": { - "x0": 3099, + "x0": 3111, "y0": 567, - "x1": 3186, + "x1": 3173, "y1": 577 }, "anchor": { @@ -671,14 +671,14 @@ "id": "ax_00_text_12", "axes_id": "ax_00", "element_type": "text", - "label": "\u00b7 scitex.template", - "text": "\u00b7 scitex.template", + "label": 
"\u00b7 scitex.io", + "text": "\u00b7 scitex.io", "geometry_px": { "coord_space": "axes", "bbox": { - "x0": 3102, + "x0": 3119, "y0": 584, - "x1": 3183, + "x1": 3166, "y1": 594 }, "anchor": { @@ -712,14 +712,14 @@ "id": "ax_00_text_13", "axes_id": "ax_00", "element_type": "text", - "label": "\u00b7 scitex.repro", - "text": "\u00b7 scitex.repro", + "label": "\u00b7 scitex.config", + "text": "\u00b7 scitex.config", "geometry_px": { "coord_space": "axes", "bbox": { - "x0": 3110, + "x0": 3108, "y0": 601, - "x1": 3174, + "x1": 3176, "y1": 611 }, "anchor": { @@ -753,14 +753,14 @@ "id": "ax_00_text_14", "axes_id": "ax_00", "element_type": "text", - "label": "\u00b7 scitex.rng", - "text": "\u00b7 scitex.rng", + "label": "\u00b7 scitex.logger", + "text": "\u00b7 scitex.logger", "geometry_px": { "coord_space": "axes", "bbox": { - "x0": 3115, + "x0": 3108, "y0": 618, - "x1": 3169, + "x1": 3177, "y1": 628 }, "anchor": { @@ -794,14 +794,14 @@ "id": "ax_00_text_15", "axes_id": "ax_00", "element_type": "text", - "label": "\u00b7 scitex.plt", - "text": "\u00b7 scitex.plt", + "label": "\u00b7 scitex.repro", + "text": "\u00b7 scitex.repro", "geometry_px": { "coord_space": "axes", "bbox": { - "x0": 3117, + "x0": 3110, "y0": 634, - "x1": 3167, + "x1": 3174, "y1": 644 }, "anchor": { @@ -835,14 +835,14 @@ "id": "ax_00_text_16", "axes_id": "ax_00", "element_type": "text", - "label": "\u00b7 (figrecipe)", - "text": "\u00b7 (figrecipe)", + "label": "\u00b7 scitex.rng", + "text": "\u00b7 scitex.rng", "geometry_px": { "coord_space": "axes", "bbox": { - "x0": 3114, + "x0": 3115, "y0": 651, - "x1": 3170, + "x1": 3169, "y1": 661 }, "anchor": { @@ -876,14 +876,14 @@ "id": "ax_00_text_17", "axes_id": "ax_00", "element_type": "text", - "label": "\u00b7 scitex.stats", - "text": "\u00b7 scitex.stats", + "label": "\u00b7 scitex.template", + "text": "\u00b7 scitex.template", "geometry_px": { "coord_space": "axes", "bbox": { - "x0": 3111, + "x0": 3102, "y0": 668, - "x1": 3173, + "x1": 3183, "y1": 678 }, "anchor": { diff --git a/scripts/assets/workflow_out/workflow.png b/scripts/assets/workflow_out/workflow.png index a601664ab..70e73d5c1 100644 Binary files a/scripts/assets/workflow_out/workflow.png and b/scripts/assets/workflow_out/workflow.png differ diff --git a/scripts/assets/workflow_out/workflow.yaml b/scripts/assets/workflow_out/workflow.yaml index 08d409e4c..ade9d1a7e 100644 --- a/scripts/assets/workflow_out/workflow.yaml +++ b/scripts/assets/workflow_out/workflow.yaml @@ -1,6 +1,6 @@ figrecipe: '1.0' type: diagram -created: '2026-02-16T10:44:59.395331' +created: '2026-02-16T12:41:31.895253' dpi: 200 diagram: title: SciTeX Ecosystem @@ -59,16 +59,16 @@ diagram: title: Core Modules subtitle: Infrastructure of All Python Scripts in SciTeX content: + - '@scitex.session' + - scitex.plt + - (figrecipe) + - scitex.stats - scitex.io - scitex.config - scitex.logger - - '@scitex.session' - - scitex.template - scitex.repro - scitex.rng - - scitex.plt - - (figrecipe) - - scitex.stats + - scitex.template emphasis: normal shape: rounded fill_color: !!python/tuple diff --git a/src/scitex/cli/scholar/__init__.py b/src/scitex/cli/scholar/__init__.py index d566967a6..b23ba57be 100755 --- a/src/scitex/cli/scholar/__init__.py +++ b/src/scitex/cli/scholar/__init__.py @@ -34,6 +34,7 @@ from ._crossref_scitex import crossref_scitex from ._fetch import fetch +from ._gui import gui from ._jobs import jobs from ._library import config, library from ._openalex_scitex import openalex_scitex @@ -260,6 +261,7 @@ def list_python_apis(ctx, verbose, 
max_depth, as_json): scholar.add_command(crossref_scitex) scholar.add_command(openalex_scitex) scholar.add_command(fetch) +scholar.add_command(gui) scholar.add_command(library) scholar.add_command(config) scholar.add_command(jobs) diff --git a/src/scitex/cli/scholar/_gui.py b/src/scitex/cli/scholar/_gui.py new file mode 100755 index 000000000..ce3495f1f --- /dev/null +++ b/src/scitex/cli/scholar/_gui.py @@ -0,0 +1,45 @@ +"""CLI command for launching Scholar GUI.""" + +from __future__ import annotations + +import click + + +@click.command() +@click.option("--port", type=int, default=5051, help="Port to serve on (default: 5051)") +@click.option("--host", default="127.0.0.1", help="Host to bind (default: 127.0.0.1)") +@click.option("--no-browser", is_flag=True, help="Don't open browser automatically") +@click.option("--db", type=click.Path(), default=None, help="CrossRef database path") +def gui(port, host, no_browser, db): + r"""Launch Scholar GUI in browser. + + \b + Interactive web interface for: + - Citation graph visualization + - Paper library management + - Literature search + - Metadata enrichment + + \b + Examples: + scitex scholar gui + scitex scholar gui --port 8080 + scitex scholar gui --db /path/to/crossref.db + """ + try: + from flask import Flask as _ # noqa: F401 + except ImportError: + click.secho("Flask is required: pip install flask", fg="red", err=True) + raise SystemExit(1) + + from scitex.scholar.gui import launch + + launch( + port=port, + host=host, + open_browser=not no_browser, + db_path=db, + ) + + +# EOF diff --git a/src/scitex/scholar/__init__.py b/src/scitex/scholar/__init__.py index f43379103..411672830 100755 --- a/src/scitex/scholar/__init__.py +++ b/src/scitex/scholar/__init__.py @@ -1,400 +1,199 @@ """SciTeX Scholar - Scientific Literature Management Made Simple. -This module provides a unified interface for: -- Searching scientific literature across multiple sources -- Automatic paper enrichment with journal metrics -- PDF downloads and local library management -- Bibliography generation in multiple formats +Searching, enriching, downloading, and organising scientific papers. 
Quick Start: from scitex.scholar import Scholar scholar = Scholar() papers = scholar.search("deep learning") - papers.save("pac.bib") + papers.save("results.bib") Installation: pip install scitex[scholar] """ -# Check for missing dependencies and warn user (internal) +# ── Internal bootstrap ─────────────────────────────────────────────────────── from scitex._install_guide import warn_module_deps as _warn_module_deps -_missing = _warn_module_deps("scholar") +_warn_module_deps("scholar") -# Import configuration - wrap all in try/except for graceful degradation +# ── Config first (required by Scholar via circular dep) ────────────────────── try: - from scitex.scholar.auth import ScholarAuthManager + from scitex.scholar.config import ScholarConfig except ImportError: - ScholarAuthManager = None # type: ignore[misc,assignment] + ScholarConfig = None # type: ignore[assignment,misc] +# ── Core classes ───────────────────────────────────────────────────────────── try: - from scitex.scholar.browser import ScholarBrowserManager + from scitex.scholar.core import Paper, Papers, Scholar except ImportError: - ScholarBrowserManager = None # type: ignore[misc,assignment] + Paper = None # type: ignore[assignment,misc] + Papers = None # type: ignore[assignment,misc] + Scholar = None # type: ignore[assignment,misc] + +# ── Paper filtering ────────────────────────────────────────────────────────── +# ── Internal helpers (accessible via __getattr__) ──────────────────────────── +from .ensure_workspace import ensure_workspace as _ensure_workspace # noqa: E402 +from .filters import apply_filters # noqa: E402 + +# ── Citation formatting (internal, accessible via __getattr__) ─────────────── +from .formatting import clean_bibtex_for_arxiv as _clean_bibtex_for_arxiv # noqa: E402 +from .formatting import clean_text as _clean_text # noqa: E402 + +# ── Citation formatting (public) ───────────────────────────────────────────── +from .formatting import ( # noqa: E402 + generate_cite_key, + make_citation_key, + papers_to_format, + to_bibtex, + to_endnote, + to_ris, + to_text_citation, +) +from .formatting import ( + paper_from_search_result as _paper_from_search_result, # noqa: E402 +) +from .formatting import paper_normalize as _paper_normalize # noqa: E402 +from .formatting import sanitize_filename as _sanitize_filename # noqa: E402 +from .formatting import to_csv_row as _to_csv_row # noqa: E402 +from .storage import ( + normalize_search_filename as _normalize_search_filename, # noqa: E402 +) +# ── Citation graph ─────────────────────────────────────────────────────────── try: - from scitex.scholar.config import ScholarConfig + from scitex.scholar.citation_graph import ( + CitationGraphBuilder as _CitationGraphBuilder, + ) + from scitex.scholar.citation_graph import ( + plot_citation_graph as _plot_citation_graph, + ) + + CitationGraphBuilder = _CitationGraphBuilder + plot_citation_graph = _plot_citation_graph except ImportError: - ScholarConfig = None # type: ignore[misc,assignment] + CitationGraphBuilder = None # type: ignore[assignment,misc] + plot_citation_graph = None # type: ignore[assignment,misc] +# ── Advanced / power-user classes (hidden, accessible via __getattr__) ─────── try: - from scitex.scholar.core import Paper, Papers, Scholar + from scitex.scholar.auth import ScholarAuthManager as _ScholarAuthManager except ImportError: - Paper = None - Papers = None - Scholar = None + _ScholarAuthManager = None # type: ignore[assignment] try: - from scitex.scholar.metadata_engines import ScholarEngine + from 
scitex.scholar.browser import ScholarBrowserManager as _ScholarBrowserManager except ImportError: - ScholarEngine = None + _ScholarBrowserManager = None # type: ignore[assignment] try: - from scitex.scholar.pdf_download import ScholarPDFDownloader + from scitex.scholar.metadata_engines import ScholarEngine as _ScholarEngine except ImportError: - ScholarPDFDownloader = None + _ScholarEngine = None # type: ignore[assignment] try: - from scitex.scholar.storage import ScholarLibrary + from scitex.scholar.pdf_download import ( + ScholarPDFDownloader as _ScholarPDFDownloader, + ) except ImportError: - ScholarLibrary = None + _ScholarPDFDownloader = None # type: ignore[assignment] try: - from scitex.scholar.url_finder import ScholarURLFinder + from scitex.scholar.storage import ScholarLibrary as _ScholarLibrary except ImportError: - ScholarURLFinder = None + _ScholarLibrary = None # type: ignore[assignment] try: - from . import _utils # Internal utilities + from scitex.scholar.url_finder import ScholarURLFinder as _ScholarURLFinder except ImportError: - _utils = None + _ScholarURLFinder = None # type: ignore[assignment] -# Local database integrations (crossref-local, openalex-local) +# Local database integrations (available if crossref-local / openalex-local installed) try: - from .local_dbs import crossref_scitex + from .local_dbs import crossref_scitex as _crossref_scitex except ImportError: - crossref_scitex = None + _crossref_scitex = None # type: ignore[assignment] try: - from .local_dbs import openalex_scitex + from .local_dbs import openalex_scitex as _openalex_scitex except ImportError: - openalex_scitex = None + _openalex_scitex = None # type: ignore[assignment] -# Workspace ensure -from .ensure_workspace import ensure_workspace # noqa: E402 +# ── Hide leaked submodule attributes ───────────────────────────────────────── +import sys as _sys -# Citation formatting (plain-dict based, no ORM dependencies) -from .formatting import ( # noqa: E402 - clean_bibtex_for_arxiv, - clean_text, - generate_cite_key, - paper_normalize, - papers_to_format, - to_bibtex, - to_csv_row, - to_endnote, - to_ris, - to_text_citation, -) - -__all__ = [ - # Workspace +_this_module = _sys.modules[__name__] +for _submod in [ + "auth", + "browser", + "config", + "core", "ensure_workspace", + "filters", + "formatting", + "impact_factor", + "local_dbs", + "metadata_engines", + "pdf_download", + "storage", + "url_finder", + "citation_graph", + "_utils", +]: + try: + delattr(_this_module, _submod) + except AttributeError: + pass +del _this_module, _submod, _sys + +# ── Lazy access for hidden names (backward compat for internal imports) ────── +_LAZY_NAMES = { + # Power-user classes + "ScholarAuthManager": "_ScholarAuthManager", + "ScholarBrowserManager": "_ScholarBrowserManager", + "ScholarEngine": "_ScholarEngine", + "ScholarPDFDownloader": "_ScholarPDFDownloader", + "ScholarLibrary": "_ScholarLibrary", + "ScholarURLFinder": "_ScholarURLFinder", + # Internal helpers + "ensure_workspace": "_ensure_workspace", + "normalize_search_filename": "_normalize_search_filename", + "clean_bibtex_for_arxiv": "_clean_bibtex_for_arxiv", + "clean_text": "_clean_text", + "paper_normalize": "_paper_normalize", + "paper_from_search_result": "_paper_from_search_result", + "sanitize_filename": "_sanitize_filename", + "to_csv_row": "_to_csv_row", +} + + +def __getattr__(name): # noqa: C901 + if name in _LAZY_NAMES: + return globals()[_LAZY_NAMES[name]] + raise AttributeError(f"module 'scitex.scholar' has no attribute {name!r}") + + +# ── 
Public API ──────────────────────────────────────────────────────────────── +__all__ = [ # Core classes "Scholar", "Paper", "Papers", - # Configuration and managers "ScholarConfig", - "ScholarEngine", - "ScholarURLFinder", - "ScholarAuthManager", - "ScholarBrowserManager", - "ScholarLibrary", - "ScholarPDFDownloader", - # Local database integrations - "crossref_scitex", # CrossRef (167M+ papers via crossref-local) - "openalex_scitex", # OpenAlex (284M+ works via openalex-local) - # Citation formatting - "clean_text", - "generate_cite_key", - "paper_normalize", + # Citation graph + "CitationGraphBuilder", + "plot_citation_graph", + # Formatting (user-facing) "to_bibtex", "to_ris", "to_endnote", - "to_csv_row", "to_text_citation", - "clean_bibtex_for_arxiv", "papers_to_format", + "generate_cite_key", + "make_citation_key", + # Filtering + "apply_filters", ] -# # Import core classes for advanced users -# from scitex.scholar.core import Paper -# from .core.Papers import Papers - -# # DOI resolver is available via: python -m scitex.scholar.resolve_doi_asyncs -# from . import doi - -# # Backward compatibility alias -# PaperCollection = Papers - -# # Import utility functions -# from .utils._formatters import ( -# papers_to_bibtex, -# papers_to_ris, -# papers_to_json, -# papers_to_markdown -# ) - -# # Import enrichment functionality -# from .metadata.enrichment._MetadataEnricher import ( -# MetadataEnricher, -# _enrich_papers_with_all, -# _enrich_papers_with_impact_factors, -# _enrich_papers_with_citations, -# ) - -# # PDF download functionality -# from .download._ScholarPDFDownloader import ( -# ScholarPDFDownloader, -# download_pdf_async, -# download_pdf_asyncs_async, -# ) -# from .download._SmartScholarPDFDownloader import SmartScholarPDFDownloader - -# # Browser-based download functionality removed - simplified structure - -# # Create module-level convenience function -# def download_pdf_asyncs( -# dois, -# download_dir=None, -# force=False, -# max_worker=4, -# show_async_progress=True, -# acknowledge_ethical_usage=None, -# **kwargs -# ): -# """ -# Download PDFs for DOIs using default Scholar instance. - -# This is a convenience function that creates a Scholar instance if needed. -# For more control, use Scholar().download_pdf_asyncs() directly. 
- -# Args: -# dois: DOI strings (list or single string) or Papers/Paper objects -# download_dir: Directory to save PDFs -# force: Force re-download -# max_worker: Maximum concurrent downloads -# show_async_progress: Show download progress -# acknowledge_ethical_usage: Acknowledge ethical usage for Sci-Hub -# **kwargs: Additional arguments - -# Returns: -# Dictionary with download results - -# Examples: -# >>> import scitex as stx -# >>> stx.scholar.download_pdf_asyncs(["10.1234/doi1", "10.5678/doi2"]) -# >>> stx.scholar.download_pdf_asyncs("10.1234/single-doi") -# """ -# scholar = Scholar() -# return scholar.download_pdf_asyncs( -# dois, -# download_dir=download_dir, -# force=force, -# max_worker=max_worker, -# show_async_progress=show_async_progress, -# acknowledge_ethical_usage=acknowledge_ethical_usage, -# **kwargs -# ) - -# # Version -# __version__ = "0.1.0" - -# # What users see with "from scitex.scholar import *" -# __all__ = [ -# # Main interface -# 'Scholar', -# 'ScholarConfig', - - -# # Convenience functions -# 'search', -# 'search_quick', -# 'enrich_bibtex', -# 'download_pdf_asyncs', # NEW: Module-level convenience function - -# "doi", -# "resolve_doi_asyncs", - -# # Core classes -# 'Paper', -# 'Papers', -# 'PaperCollection', # Backward compatibility alias - -# # Format converters -# 'papers_to_bibtex', -# 'papers_to_ris', -# 'papers_to_json', -# 'papers_to_markdown', - -# # Enrichment -# 'MetadataEnricher', - -# # PDF download functionality -# 'ScholarPDFDownloader', -# 'download_pdf_async', -# 'download_pdf_asyncs_async', - -# # Browser-based functionality - -# # Authentication -# 'ScholarAuthManager', -# # 'OpenAthensAuthenticator', -# # 'ShibbolethAuthenticator', -# # 'EZProxyAuthenticator', - -# # Resolution -# 'SingleDOIResolver', -# 'OpenURLResolver', -# 'ResumableOpenURLResolver', -# # 'BatchDOIResolver', - -# # Enrichment -# 'MetadataEnricher', -# 'JCR_YEAR', - -# # Validation -# 'PDFValidator', -# 'ValidationResult', - -# # # Database -# # 'PaperDatabase', -# # 'DatabaseEntry', -# # 'DatabaseIndex', - -# # Semantic Search -# # 'SemanticSearchEngine', -# # 'VectorDatabase', -# # 'Embedder', -# ] - -# # # For backward compatibility, provide access to old functions with deprecation warnings -# # def __getattr__(name): -# # """Provide backward compatibility with deprecation warnings.""" -# # import warnings - -# # # Handle special IPython attributes -# # if name in ['__custom_documentations__', '__wrapped__']: -# # raise AttributeError(f"module '{__name__}' has no attribute '{name}'") - -# # # Map old names to new functionality -# # compatibility_map = { -# # 'search_sync': 'search', -# # 'build_index': 'Scholar()._index_local_pdfs', -# # 'get_scholar_dir': 'Scholar().get_workspace_dir()', -# # 'LocalSearchEngine': 'Scholar', -# # 'VectorSearchEngine': 'Scholar', -# # 'ScholarPDFDownloader': 'Scholar', -# # 'search_papers': 'search', -# # 'SemanticScholarPaper': 'Paper', -# # 'PaperMetadata': 'Paper', -# # 'PaperAcquisition': 'Scholar', -# # 'SemanticScholarClient': 'Scholar', -# # 'JournalMetrics': 'Scholar', -# # 'PaperEnrichmentService': 'Scholar', -# # 'generate_enriched_bibliography': 'Papers.save' -# # } - -# # if name in compatibility_map: -# # warnings.warn( -# # f"{name} is deprecated. 
Use {compatibility_map[name]} instead.", -# # DeprecationWarning, -# # stacklevel=2 -# # ) - -# # # Return the Scholar class for most cases -# # if name in ['search_sync', 'search_papers']: -# # return search -# # elif name == 'build_index': -# # def build_index(paths, **kwargs): -# # scholar = Scholar() -# # stats = {} -# # for path in paths: -# # stats.update(scholar._index_local_pdfs(path)) -# # return stats -# # return build_index -# # else: -# # return Scholar - -# # from scitex.logging import ScholarError -# # raise ScholarError( -# # f"Module attribute not found: '{name}'", -# # context={"module": __name__, "attribute": name}, -# # suggestion=f"Available attributes: Scholar, Paper, Papers, search, enrich_bibtex" -# # ) - - -# # Import new modules -# from .auth import ( -# ScholarAuthManager, -# # OpenAthensAuthenticator, -# # ShibbolethAuthenticator, -# # EZProxyAuthenticator, -# ) -# from .metadata.doi._SingleDOIResovler import SingleDOIResolver -# from .open_url import OpenURLResolver, ResumableOpenURLResolver -# from .metadata.enrichment import ( -# MetadataEnricher, -# JCR_YEAR, -# ) -# # from .cli import resolve_doi_asyncs -# from .validation import PDFValidator, ValidationResult -# # from .database import PaperDatabase, DatabaseEntry, DatabaseIndex -# # from .search import SemanticSearchEngine, VectorDatabase, Embedder - -# # Module docstring for help() -# def _module_docstring(): -# """ -# SciTeX Scholar - Scientific Literature Management - -# Main Classes: -# Scholar: Main interface for all functionality -# Paper: Represents a scientific paper -# Papers: Collection of papers with analysis tools - -# Quick Start: -# >>> from scitex.scholar import Scholar -# >>> scholar = Scholar() -# >>> papers = scholar.search("machine learning") -# >>> papers.filter(year_min=2020).save("ml_pac.bib") - -# Common Workflows: -# # Search and enrich -# papers = scholar.search("deep learning", year_min=2022) - -# # Download PDFs -# scholar.download_pdf_asyncs(papers) - -# # Filter results -# high_impact = papers.filter(impact_factor_min=5.0) - -# # Save bibliography -# papers.save("bibliography.bib", format="bibtex") - -# # Search local library -# scholar._index_local_pdfs("./my_papers") -# local = scholar.search_local("transformer") - -# For more information, see the documentation at: -# https://github.com/ywatanabe1989/SciTeX-Code -# """ -# pass - -# # Set module docstring -# __doc__ = _module_docstring.__doc__ - -# # EOF - # EOF diff --git a/src/scitex/scholar/citation_graph/__init__.py b/src/scitex/scholar/citation_graph/__init__.py index 299fc15c0..876515d2d 100755 --- a/src/scitex/scholar/citation_graph/__init__.py +++ b/src/scitex/scholar/citation_graph/__init__.py @@ -19,10 +19,13 @@ from .builder import CitationGraphBuilder from .models import CitationEdge, CitationGraph, PaperNode +from .visualization import list_backends, plot_citation_graph __all__ = [ "CitationGraphBuilder", "PaperNode", "CitationEdge", "CitationGraph", + "plot_citation_graph", + "list_backends", ] diff --git a/src/scitex/scholar/citation_graph/models.py b/src/scitex/scholar/citation_graph/models.py index b3fe2e2b1..1674a5082 100755 --- a/src/scitex/scholar/citation_graph/models.py +++ b/src/scitex/scholar/citation_graph/models.py @@ -3,7 +3,7 @@ """ from dataclasses import dataclass, field -from typing import Dict, List, Optional +from typing import Dict, List @dataclass @@ -78,3 +78,34 @@ def node_count(self) -> int: def edge_count(self) -> int: """Number of edges in graph.""" return len(self.edges) + + def 
to_networkx(self):
+        """Convert to NetworkX DiGraph with node attributes.
+
+        Returns
+        -------
+        networkx.DiGraph
+            Directed graph with node attributes: title, short_title,
+            year, citations, similarity, journal.
+        """
+        import networkx as nx
+
+        G = nx.DiGraph()
+        for node in self.nodes:
+            G.add_node(
+                node.doi,
+                title=node.title,
+                short_title=node.title[:30],
+                year=node.year,
+                citations=node.citation_count,
+                similarity=node.similarity_score,
+                journal=node.journal,
+            )
+        for edge in self.edges:
+            G.add_edge(
+                edge.source,
+                edge.target,
+                edge_type=edge.edge_type,
+                weight=edge.weight,
+            )
+        return G
diff --git a/src/scitex/scholar/citation_graph/visualization.py b/src/scitex/scholar/citation_graph/visualization.py
new file mode 100755
index 000000000..04d8ae98b
--- /dev/null
+++ b/src/scitex/scholar/citation_graph/visualization.py
@@ -0,0 +1,265 @@
+"""Pluggable visualization for citation graphs.
+
+Supports multiple rendering backends with automatic fallback:
+    figrecipe > scitex.plt > matplotlib > pyvis
+
+Example
+-------
+    >>> from scitex.scholar.citation_graph import CitationGraphBuilder, plot_citation_graph
+    >>> builder = CitationGraphBuilder("/path/to/crossref.db")
+    >>> graph = builder.build("10.1038/s41586-020-2008-3", top_n=20)
+    >>> result = plot_citation_graph(graph)  # auto backend
+    >>> fig = result["fig"]  # static backends return {'fig', 'ax', 'pos', 'backend'}
+    >>> plot_citation_graph(graph, backend="pyvis", output="network.html")
+"""
+
+from typing import Any, Dict, Optional
+
+# ── Backend availability flags ───────────────────────────────────────────────
+
+try:
+    from figrecipe._graph import draw_graph as _fr_draw_graph
+    from figrecipe._graph._presets import get_preset as _fr_get_preset
+
+    _FIGRECIPE_AVAILABLE = True
+except ImportError:
+    _FIGRECIPE_AVAILABLE = False
+
+try:
+    from scitex.plt._figrecipe_integration import draw_graph as _stx_draw_graph
+
+    _SCITEX_PLT_AVAILABLE = True
+except ImportError:
+    _SCITEX_PLT_AVAILABLE = False
+
+try:
+    from pyvis.network import Network as _PyvisNetwork
+
+    _PYVIS_AVAILABLE = True
+except ImportError:
+    _PYVIS_AVAILABLE = False
+
+_MATPLOTLIB_AVAILABLE = True  # always available (core dependency)
+
+# Backend resolution order
+_BACKEND_PRIORITY = ["figrecipe", "scitex.plt", "matplotlib", "pyvis"]
+
+
+def list_backends() -> Dict[str, bool]:
+    """List available visualization backends.
+
+    Returns
+    -------
+    dict
+        Mapping of backend name to availability.
+    """
+    return {
+        "figrecipe": _FIGRECIPE_AVAILABLE,
+        "scitex.plt": _SCITEX_PLT_AVAILABLE,
+        "matplotlib": _MATPLOTLIB_AVAILABLE,
+        "pyvis": _PYVIS_AVAILABLE,
+    }
+
+
+def _resolve_backend(backend: str) -> str:
+    """Resolve 'auto' to the best available backend."""
+    if backend != "auto":
+        available = list_backends()
+        if backend not in available:
+            raise ValueError(
+                f"Unknown backend '{backend}'. Available: {list(available.keys())}"
+            )
+        if not available[backend]:
+            raise ImportError(
+                f"Backend '{backend}' is not available. "
+                f"Available backends: "
+                f"{[k for k, v in available.items() if v]}"
+            )
+        return backend
+
+    available = list_backends()  # probe once, not on every loop iteration
+    for name in _BACKEND_PRIORITY:
+        if available[name]:
+            return name
+
+    return "matplotlib"  # fallback (always available)
+
+
+# ── Backend implementations ──────────────────────────────────────────────────
+
+
+def _plot_figrecipe(G, output=None, **kwargs):
+    """Render with figrecipe (publication-quality static)."""
+    import matplotlib.pyplot as plt
+
+    # Consume figure-level options *before* merging so they are not
+    # forwarded to draw_graph, which styles the axes only.
+    figsize = kwargs.pop("figsize", (8, 6))
+    dpi = kwargs.pop("dpi", 150)
+
+    preset = _fr_get_preset("citation")
+    merged = {**preset, **kwargs}
+
+    fig, ax = plt.subplots(1, 1, figsize=figsize)
+    result = _fr_draw_graph(ax, G, **merged)
+
+    if output:
+        fig.savefig(output, dpi=dpi, bbox_inches="tight")
+
+    return {"fig": fig, "ax": ax, "pos": result["pos"], "backend": "figrecipe"}
+
+
+def _plot_scitex_plt(G, output=None, **kwargs):
+    """Render with scitex.plt (AxisWrapper + CSV auto-export)."""
+    import scitex.plt as stx_plt
+
+    # Consume figsize here (scitex.plt.subplots mirrors matplotlib's
+    # signature) instead of forwarding it to draw_graph.
+    figsize = kwargs.pop("figsize", (8, 6))
+
+    preset = _fr_get_preset("citation") if _FIGRECIPE_AVAILABLE else {}
+    merged = {**preset, **kwargs}
+
+    fig, ax = stx_plt.subplots(figsize=figsize)
+    result = _stx_draw_graph(ax, G, **merged)
+
+    if output:
+        import scitex.io
+
+        scitex.io.save(fig, output)
+
+    return {"fig": fig, "ax": ax, "pos": result["pos"], "backend": "scitex.plt"}
+
+
+def _plot_matplotlib(G, output=None, **kwargs):  # noqa: C901
+    """Render with raw matplotlib + networkx (no external deps)."""
+    import matplotlib.pyplot as plt
+    import networkx as nx
+
+    fig, ax = plt.subplots(1, 1, figsize=kwargs.pop("figsize", (8, 6)))
+
+    layout = kwargs.pop("layout", "spring")
+    seed = kwargs.pop("seed", 42)
+
+    # Compute layout
+    layout_funcs = {
+        "spring": lambda g: nx.spring_layout(g, seed=seed),
+        "circular": nx.circular_layout,
+        "kamada_kawai": nx.kamada_kawai_layout,
+        "shell": nx.shell_layout,
+        "spectral": nx.spectral_layout,
+    }
+    layout_fn = layout_funcs.get(layout, layout_funcs["spring"])
+    pos = layout_fn(G)
+
+    # Node sizing by citations
+    citations = [G.nodes[n].get("citations", 1) for n in G.nodes()]
+    max_c = max(citations) if citations else 1
+    sizes = [50 + (c / max(max_c, 1)) * 250 for c in citations]
+
+    # Node coloring by year
+    years = [G.nodes[n].get("year", 0) for n in G.nodes()]
+
+    # Draw
+    nx.draw_networkx_edges(G, pos, alpha=0.3, ax=ax)
+    nx.draw_networkx_nodes(
+        G,
+        pos,
+        node_size=sizes,
+        node_color=years if any(years) else "#3498db",
+        cmap=plt.cm.viridis if any(years) else None,
+        alpha=0.8,
+        ax=ax,
+    )
+
+    # Labels: short titles
+    labels = {n: G.nodes[n].get("short_title", n)[:20] for n in G.nodes()}
+    nx.draw_networkx_labels(G, pos, labels=labels, font_size=5, ax=ax)
+
+    ax.axis("off")
+
+    if output:
+        fig.savefig(output, dpi=kwargs.get("dpi", 150), bbox_inches="tight")
+
+    return {"fig": fig, "ax": ax, "pos": pos, "backend": "matplotlib"}
+
+
+def _plot_pyvis(G, output=None, **kwargs):
+    """Render as interactive HTML with pyvis."""
+    if output is None:
+        raise ValueError("pyvis backend requires output path (HTML file)")
+
+    net = _PyvisNetwork(
+        height="750px",
+        width="100%",
+        bgcolor="#ffffff",
+        font_color="black",
+    )
+    net.barnes_hut()
+
+    for node_id in G.nodes():
+        data = G.nodes[node_id]
+        title = data.get("title", str(node_id))
+        citations = data.get("citations", 0)
+        year = data.get("year", "?")
+        size = 10 + min(citations, 500) ** 0.5 * 2
+
+        net.add_node(
+            node_id,
+            label=f"{title[:40]}...\n({year})",
+            title=f"{title}\n{node_id}\nCitations: {citations}",
+            size=size,
+            color="#3498db" if citations > 50 else "#95a5a6",
+        )
+
+    for u, v in
G.edges(): + net.add_edge(u, v) + + net.save_graph(str(output)) + return {"output": str(output), "backend": "pyvis"} + + +_BACKEND_DISPATCH = { + "figrecipe": _plot_figrecipe, + "scitex.plt": _plot_scitex_plt, + "matplotlib": _plot_matplotlib, + "pyvis": _plot_pyvis, +} + + +# ── Public API ─────────────────────────────────────────────────────────────── + + +def plot_citation_graph( + graph, + backend: str = "auto", + output: Optional[str] = None, + **kwargs, +) -> Dict[str, Any]: + """Visualize a citation graph with pluggable backends. + + Parameters + ---------- + graph : CitationGraph or networkx.DiGraph + Citation network to visualize. CitationGraph is auto-converted + via ``to_networkx()``. + backend : str + Rendering backend: 'auto', 'figrecipe', 'scitex.plt', + 'matplotlib', or 'pyvis'. Default 'auto' picks the best available. + output : str, optional + Output file path. Required for 'pyvis' backend (HTML). + For static backends, saves the figure to this path. + **kwargs + Backend-specific keyword arguments (layout, seed, figsize, etc.). + + Returns + ------- + dict + Backend-specific result. Static backends return + ``{'fig', 'ax', 'pos', 'backend'}``. + Pyvis returns ``{'output', 'backend'}``. + """ + from .models import CitationGraph + + # Convert CitationGraph to NetworkX if needed + if isinstance(graph, CitationGraph): + G = graph.to_networkx() + else: + G = graph + + resolved = _resolve_backend(backend) + return _BACKEND_DISPATCH[resolved](G, output=output, **kwargs) + + +__all__ = ["plot_citation_graph", "list_backends"] + +# EOF diff --git a/src/scitex/scholar/filters.py b/src/scitex/scholar/filters.py new file mode 100755 index 000000000..3256b6aeb --- /dev/null +++ b/src/scitex/scholar/filters.py @@ -0,0 +1,234 @@ +#!/usr/bin/env python3 +# File: ./src/scitex/scholar/filters.py + +""" +Pure-function paper filtering for scitex.scholar. + +Works on plain dicts only — no Django ORM or model imports required. + +Expected paper dict keys: + title : str + authors : list[str] + journal : str + year : int or str + citations : int or str + impact_factor : float or str or None + is_open_access: bool + source : str + snippet : str (optional, used for doc_type detection) +""" + +from typing import Any, Dict, List, Optional + + +def apply_filters( + papers: List[Dict[str, Any]], + filters: Optional[Dict[str, Any]] = None, + parsed_operators: Optional[Dict[str, Any]] = None, +) -> List[Dict[str, Any]]: + """Filter a list of paper dicts by various criteria. + + Args: + papers: List of paper dicts. Each dict should contain the keys + described in the module docstring; missing keys are treated as + empty / zero values. + filters: Dict of filter criteria extracted from a search form or URL + parameters. Supported keys: + year_from, year_to – year range (int) + min_citations, max_citations – citation range (int) + min_impact_factor – minimum IF (float) + max_impact_factor – maximum IF (float) + authors – list of author name strings (legacy) + journal – journal name substring (legacy, str) + open_access – bool + doc_type – "review" | "preprint" | other + language – language string ("english" passes) + parsed_operators: Dict produced by + SearchQueryParser.from_shell_syntax() or the equivalent + parse_query_operators() function from scitex-cloud. 
Supported
+            keys:
+                title_includes, title_excludes       – list[str]
+                author_includes, author_excludes     – list[str]
+                journal_includes, journal_excludes   – list[str]
+                year_min, year_max                   – int
+                citations_min, citations_max         – int
+                impact_factor_min, impact_factor_max – float
+
+    Returns
+    -------
+    Filtered list of paper dicts (same objects, not copies).
+    """
+    if not filters and not parsed_operators:
+        return papers
+
+    filtered: List[Dict[str, Any]] = []
+
+    for paper in papers:
+        # ------------------------------------------------------------------
+        # Operator filters: title / author / journal includes and excludes.
+        # All of these read parsed_operators, so they sit under one guard.
+        # ------------------------------------------------------------------
+        if parsed_operators:
+            title = paper.get("title", "").lower()
+
+            if parsed_operators.get("title_includes"):
+                if not all(
+                    term.lower() in title
+                    for term in parsed_operators["title_includes"]
+                ):
+                    continue
+
+            if parsed_operators.get("title_excludes"):
+                if any(
+                    term.lower() in title
+                    for term in parsed_operators["title_excludes"]
+                ):
+                    continue
+
+            authors_text = " ".join(paper.get("authors", [])).lower()
+
+            if parsed_operators.get("author_includes"):
+                if not all(
+                    term.lower() in authors_text
+                    for term in parsed_operators["author_includes"]
+                ):
+                    continue
+
+            if parsed_operators.get("author_excludes"):
+                if any(
+                    term.lower() in authors_text
+                    for term in parsed_operators["author_excludes"]
+                ):
+                    continue
+
+            journal_name = paper.get("journal", "").lower()
+
+            if parsed_operators.get("journal_includes"):
+                if not all(
+                    term.lower() in journal_name
+                    for term in parsed_operators["journal_includes"]
+                ):
+                    continue
+
+            if parsed_operators.get("journal_excludes"):
+                if any(
+                    term.lower() in journal_name
+                    for term in parsed_operators["journal_excludes"]
+                ):
+                    continue
+
+        # ------------------------------------------------------------------
+        # Year range (operator values override the form filters when set)
+        # ------------------------------------------------------------------
+        year_from = filters.get("year_from") if filters else None
+        year_to = filters.get("year_to") if filters else None
+
+        if parsed_operators:
+            year_from = parsed_operators.get("year_min") or year_from
+            year_to = parsed_operators.get("year_max") or year_to
+
+        if year_from is not None or year_to is not None:
+            try:
+                year = int(paper.get("year", 0))
+                if year_from is not None and year < year_from:
+                    continue
+                if year_to is not None and year > year_to:
+                    continue
+            except (ValueError, TypeError):
+                continue
+
+        # ------------------------------------------------------------------
+        # Citation count (operator values override the form filters)
+        # ------------------------------------------------------------------
+        min_citations = filters.get("min_citations") if filters else None
+        max_citations = filters.get("max_citations") if filters else None
+
+        if parsed_operators:
+            min_citations = parsed_operators.get("citations_min") or min_citations
+            max_citations = parsed_operators.get("citations_max") or max_citations
+
+        if min_citations is not None or max_citations is not None:
+            try:
+                citations = int(paper.get("citations", 0))
+                if min_citations is not None and citations < min_citations:
+                    continue
+                if max_citations is not None and citations > max_citations:
+                    continue
+            except (ValueError, TypeError):
+                continue
+
+        # ------------------------------------------------------------------
+        # Impact factor (operator values override the form filters)
+        # ------------------------------------------------------------------
+        min_if = filters.get("min_impact_factor") if filters else None
+        max_if = filters.get("max_impact_factor") if filters else None
+
+        if parsed_operators:
+            min_if = parsed_operators.get("impact_factor_min") or min_if
+            max_if = parsed_operators.get("impact_factor_max") or max_if
+
+        if min_if is not None or max_if is not None:
+            try:
+                impact_factor = float(paper.get("impact_factor", 0) or 0)
+                if min_if is not None and impact_factor < min_if:
+                    continue
+                if max_if is not None and impact_factor > max_if:
+                    continue
+            except (ValueError, TypeError):
+                continue
+
+        # ------------------------------------------------------------------
+        # Legacy author filter (filters["authors"] is a list of name strings)
+        # ------------------------------------------------------------------
+        if filters and filters.get("authors"):
+            authors_text = " ".join(paper.get("authors", [])).lower()
+            if not any(name.lower() in authors_text for name in filters["authors"]):
+                continue
+
+        # ------------------------------------------------------------------
+        # Legacy journal filter (filters["journal"] is a substring)
+        # ------------------------------------------------------------------
+        if filters and filters.get("journal"):
+            journal_name = paper.get("journal", "").lower()
+            if filters["journal"].lower() not in journal_name:
+                continue
+
+        # ------------------------------------------------------------------
+        # Open access
+        # ------------------------------------------------------------------
+        if filters and filters.get("open_access") and not paper.get("is_open_access"):
+            continue
+
+        # ------------------------------------------------------------------
+        # Document type (basic heuristic)
+        # ------------------------------------------------------------------
+        if filters and filters.get("doc_type"):
+            title_and_snippet = (
+                paper.get("title", "") + " " + paper.get("snippet", "")
+            ).lower()
+            doc_type = filters["doc_type"].lower()
+
+            if doc_type == "review" and "review" not in title_and_snippet:
+                continue
+            elif (
+                doc_type == "preprint"
+                and "preprint" not in paper.get("source", "").lower()
+            ):
+                continue
+
+        # ------------------------------------------------------------------
+        # Language (basic: only "english" passes)
+        # ------------------------------------------------------------------
+        if filters and filters.get("language"):
+            if filters["language"].lower() != "english":
+                continue
+
+        filtered.append(paper)
+
+    return filtered
+
+
+# EOF
diff --git a/src/scitex/scholar/formatting.py b/src/scitex/scholar/formatting.py
index 665c784ec..e34c309bb 100755
--- a/src/scitex/scholar/formatting.py
+++ b/src/scitex/scholar/formatting.py
@@ -377,12 +377,80 @@ def papers_to_format(papers: List[dict], fmt: str) -> str:
     return "\n\n".join(func(p) for p in papers)
 
 
+# ── Search result normalization ─────────────────────────────────
+
+
+def paper_from_search_result(result: dict) -> dict:
+    """Normalize a raw search-API result dict to the standard paper format.
+
+    Handles field aliases from different search engines (externalUrl,
+    snippet, etc.) and fills missing fields with safe defaults.
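+
+    Example (illustrative; shows the snippet-to-abstract alias and defaults):
+        >>> p = paper_from_search_result({"title": "T", "snippet": "S"})
+        >>> (p["title"], p["abstract"], p["source"])
+        ('T', 'S', 'unknown')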
+    """
+    import re as _re
+
+    journal = result.get("journal") or ""
+    journal = _re.sub(r"\s*\(IF[^)]*\)", "", journal)
+    return {
+        "title": result.get("title") or "Unknown",
+        "authors": result.get("authors") or "",
+        "journal": journal,
+        "year": str(result.get("year") or ""),
+        "doi": result.get("doi") or result.get("DOI") or "",
+        "pmid": result.get("pmid") or "",
+        "arxiv_id": result.get("arxiv_id") or "",
+        "citations": result.get("citations") or result.get("citation_count") or 0,
+        "impact_factor": result.get("impact_factor") or 0,
+        "is_open_access": result.get("is_open_access", False),
+        "abstract": result.get("abstract") or result.get("snippet") or "",
+        "url": (
+            result.get("externalUrl")
+            or result.get("external_url")
+            or result.get("pdf_url")
+            or ""
+        ),
+        "source": result.get("source") or "unknown",
+    }
+
+
+def make_citation_key(last_name: str, year=None) -> str:
+    """Generate a citation key from author last name and year.
+
+    Args:
+        last_name: Author last name (non-letter characters are stripped).
+        year: Publication year (optional).
+
+    Returns
+    -------
+    Citation key string, e.g. ``smith2024``.
+
+    Examples
+    --------
+    >>> make_citation_key("Smith", 2024)
+    'smith2024'
+    >>> make_citation_key("O'Brien")
+    'obrien'
+    """
+    import re as _re
+
+    name = _re.sub(r"[^a-zA-Z]", "", last_name).lower()
+    return f"{name}{year}" if year else name
+
+
+def sanitize_filename(filename: str, max_length: int = 50) -> str:
+    """Sanitize a string for use as a download filename.
+
+    Replaces shell-unsafe characters with underscores, truncates to
+    *max_length* characters, then collapses whitespace runs into single
+    underscores.
+    """
+    import re as _re
+
+    filename = _re.sub(r'[<>:"/\\|?*]', "_", filename)
+    filename = filename[:max_length]
+    return _re.sub(r"\s+", "_", filename.strip())
+
+
 # ── Public API ──────────────────────────────────
 __all__ = [
     "clean_text",
     "generate_cite_key",
     "paper_normalize",
+    "paper_from_search_result",
+    "make_citation_key",
+    "sanitize_filename",
     "to_bibtex",
     "to_ris",
     "to_endnote",
diff --git a/src/scitex/scholar/gui/__init__.py b/src/scitex/scholar/gui/__init__.py
new file mode 100755
index 000000000..c0e08ce52
--- /dev/null
+++ b/src/scitex/scholar/gui/__init__.py
@@ -0,0 +1,54 @@
+"""Scholar GUI - Interactive Flask app for scientific literature management.
+
+Launch via CLI:
+    scitex scholar gui
+    scitex scholar gui --port 8080
+
+Or from Python:
+    from scitex.scholar.gui import launch
+    launch(port=5051)
+"""
+
+from typing import Optional
+
+
+def launch(
+    port: int = 5051,
+    host: str = "127.0.0.1",
+    open_browser: bool = True,
+    db_path: Optional[str] = None,
+):
+    """Launch the Scholar GUI in a browser.
+
+    Parameters
+    ----------
+    port : int
+        Port to serve on (default: 5051).
+    host : str
+        Host to bind to (default: 127.0.0.1).
+    open_browser : bool
+        Whether to open a browser tab automatically.
+    db_path : str, optional
+        Path to CrossRef SQLite database. Auto-detected if not given.
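+
+    Example
+    -------
+    Blocks until interrupted (Ctrl+C), so call it from a dedicated
+    process or terminal::
+
+        from scitex.scholar.gui import launch
+        launch(port=8080, open_browser=False)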
+ """ + from ._app import create_app + + app = create_app(db_path=db_path) + url = f"http://{host}:{port}" + + if open_browser: + import threading + import webbrowser + + threading.Timer(1.0, webbrowser.open, args=[url]).start() + + print(f"Scholar GUI running at {url}") + print("Press Ctrl+C to stop.") + + app.run(host=host, port=port, debug=False, use_reloader=False, threaded=True) + + +__all__ = ["launch"] + +# EOF diff --git a/src/scitex/scholar/gui/_app.py b/src/scitex/scholar/gui/_app.py new file mode 100755 index 000000000..dc24be9af --- /dev/null +++ b/src/scitex/scholar/gui/_app.py @@ -0,0 +1,91 @@ +"""Flask app factory for Scholar GUI.""" + +from pathlib import Path +from typing import Optional + +from flask import Flask + + +def _find_crossref_db(db_path: Optional[str] = None) -> Optional[str]: + """Auto-detect CrossRef database path.""" + if db_path and Path(db_path).exists(): + return db_path + + # Static candidates + candidates = [ + Path.home() / ".scitex" / "scholar" / "crossref.db", + Path.home() / "proj" / "crossref_local" / "data" / "crossref.db", + Path.home() / "proj" / "crossref-local" / "data" / "crossref.db", + Path.home() / ".proj" / "crossref_local" / "data" / "crossref.db", + ] + for p in candidates: + if p.exists(): + return str(p) + + # Try crossref_local module info as last resort + try: + import crossref_local + + info = crossref_local.info() + p = info.get("db_path") + if p and Path(p).exists(): + return str(p) + except Exception: + pass + + return None + + +def create_app(db_path: Optional[str] = None) -> Flask: + """Create and configure the Flask application. + + Parameters + ---------- + db_path : str, optional + Path to CrossRef database. Auto-detected if not given. + """ + static_dir = Path(__file__).parent / "static" + template_dir = Path(__file__).parent / "templates" + + app = Flask( + __name__, + static_folder=str(static_dir), + static_url_path="/static", + template_folder=str(template_dir), + ) + + # Store DB path in app config + resolved_db = _find_crossref_db(db_path) + app.config["CROSSREF_DB_PATH"] = resolved_db + + # Register routes + from ._routes_graph import register_graph_routes + + register_graph_routes(app) + + @app.route("/") + def index(): + from flask import render_template + + return render_template( + "scholar.html", + db_available=resolved_db is not None, + db_path=resolved_db or "Not found", + ) + + @app.route("/api/health") + def health(): + from flask import jsonify + + return jsonify( + { + "status": "ok", + "db_available": resolved_db is not None, + "db_path": resolved_db, + } + ) + + return app + + +# EOF diff --git a/src/scitex/scholar/gui/_routes_graph.py b/src/scitex/scholar/gui/_routes_graph.py new file mode 100755 index 000000000..e285a460d --- /dev/null +++ b/src/scitex/scholar/gui/_routes_graph.py @@ -0,0 +1,206 @@ +"""Citation graph API routes for Scholar GUI. + +Ported from scitex-cloud/apps/scholar_app/api/citation_graph.py +(Django REST → Flask). 
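+
+Exposes read-only GET endpoints: /api/graph/network, /api/graph/related,
+/api/graph/paper, and /api/graph/health (the DOI is passed as ``?doi=...``
+where applicable).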
+"""
+
+import hashlib
+import logging
+import time
+from typing import Dict, Optional
+
+from flask import Flask, current_app, jsonify, request
+
+logger = logging.getLogger(__name__)
+
+# Simple in-memory cache (no Django cache dependency)
+_cache: Dict[str, dict] = {}
+_cache_expiry: Dict[str, float] = {}
+_CACHE_TTL = 3600  # seconds (1 hour)
+
+
+def _cache_get(key: str) -> Optional[dict]:
+    """Get value from cache if not expired."""
+    if key in _cache:
+        if time.time() < _cache_expiry.get(key, 0):
+            return _cache[key]
+        del _cache[key]
+        _cache_expiry.pop(key, None)
+    return None
+
+
+def _cache_set(key: str, value: dict, ttl: int = _CACHE_TTL):
+    """Store value in cache with a per-entry TTL (seconds)."""
+    _cache[key] = value
+    _cache_expiry[key] = time.time() + ttl
+
+
+def _make_cache_key(prefix: str, doi: str, **kwargs) -> str:
+    """Create cache key from parameters."""
+    parts = [prefix, doi.lower()]
+    for k, v in sorted(kwargs.items()):
+        parts.append(f"{k}={v}")
+    return f"cg:{hashlib.md5(':'.join(parts).encode()).hexdigest()}"
+
+
+def _get_builder():
+    """Get or create CitationGraphBuilder from app config."""
+    db_path = current_app.config.get("CROSSREF_DB_PATH")
+    if not db_path:
+        return None
+
+    from scitex.scholar.citation_graph import CitationGraphBuilder
+
+    return CitationGraphBuilder(db_path)
+
+
+def register_graph_routes(app: Flask):
+    """Register citation graph API routes."""
+
+    @app.route("/api/graph/network")
+    def graph_network():
+        """Build citation network for a DOI."""
+        doi = request.args.get("doi")
+        if not doi:
+            return jsonify({"error": "DOI parameter required"}), 400
+
+        try:
+            top_n = int(request.args.get("top_n", 20))
+            top_n = max(1, min(50, top_n))
+            weight_coupling = float(request.args.get("weight_coupling", 2.0))
+            weight_cocitation = float(request.args.get("weight_cocitation", 2.0))
+            weight_direct = float(request.args.get("weight_direct", 1.0))
+        except ValueError as e:
+            return jsonify({"error": f"Invalid parameter: {e}"}), 400
+
+        use_cache = request.args.get("no_cache", "false").lower() != "true"
+
+        # Check cache
+        cache_key = _make_cache_key(
+            "net",
+            doi,
+            top_n=top_n,
+            wc=weight_coupling,
+            wco=weight_cocitation,
+            wd=weight_direct,
+        )
+        if use_cache:
+            cached = _cache_get(cache_key)
+            if cached:
+                cached["metadata"]["cached"] = True
+                return jsonify(cached)
+
+        # Build network
+        builder = _get_builder()
+        if not builder:
+            return jsonify({"error": "CrossRef database not configured"}), 503
+
+        try:
+            graph = builder.build(
+                seed_doi=doi,
+                top_n=top_n,
+                weight_coupling=weight_coupling,
+                weight_cocitation=weight_cocitation,
+                weight_direct=weight_direct,
+            )
+            result = graph.to_dict()
+            result["metadata"]["cached"] = False
+
+            # Mark seed node
+            for node in result["nodes"]:
+                node["is_seed"] = node["id"].lower() == doi.lower()
+
+            _cache_set(cache_key, result)
+            return jsonify(result)
+
+        except FileNotFoundError:
+            return jsonify({"error": "CrossRef database not found"}), 503
+        except Exception as e:
+            logger.error(f"Error building network for {doi}: {e}", exc_info=True)
+            return jsonify({"error": f"Failed to build network: {e}"}), 500
+
+    @app.route("/api/graph/related")
+    def graph_related():
+        """Get related papers for a DOI."""
+        doi = request.args.get("doi")
+        if not doi:
+            return jsonify({"error": "DOI parameter required"}), 400
+
+        try:
+            limit = int(request.args.get("limit", 10))
+            limit = max(1, min(30, limit))
+        except ValueError as e:
+            return jsonify({"error": f"Invalid parameter: {e}"}), 400
+
+        builder = _get_builder()
+        if not builder:
return jsonify({"error": "CrossRef database not configured"}), 503 + + try: + graph = builder.build(seed_doi=doi, top_n=limit) + result = graph.to_dict() + + # Sort by similarity, exclude seed + related = sorted( + [n for n in result["nodes"] if n["id"].lower() != doi.lower()], + key=lambda n: n.get("similarity_score", 0), + reverse=True, + )[:limit] + + return jsonify({"doi": doi, "related": related, "count": len(related)}) + + except Exception as e: + logger.error(f"Error getting related papers for {doi}: {e}", exc_info=True) + return jsonify({"error": f"Failed to get related papers: {e}"}), 500 + + @app.route("/api/graph/paper") + def graph_paper(): + """Get paper summary.""" + doi = request.args.get("doi") + if not doi: + return jsonify({"error": "DOI parameter required"}), 400 + + builder = _get_builder() + if not builder: + return jsonify({"error": "CrossRef database not configured"}), 503 + + try: + summary = builder.get_paper_summary(doi) + if summary: + return jsonify(summary) + return jsonify({"error": "Paper not found"}), 404 + + except Exception as e: + logger.error(f"Error getting paper summary for {doi}: {e}", exc_info=True) + return jsonify({"error": f"Failed to get summary: {e}"}), 500 + + @app.route("/api/graph/health") + def graph_health(): + """Health check for citation graph service.""" + db_path = current_app.config.get("CROSSREF_DB_PATH") + if not db_path: + return jsonify( + {"status": "unhealthy", "error": "No database configured"} + ), 503 + + try: + builder = _get_builder() + summary = builder.get_paper_summary("10.1038/s41586-020-2008-3") + return jsonify( + { + "status": "healthy" if summary else "degraded", + "database": db_path, + "database_accessible": True, + } + ) + except Exception as e: + return jsonify( + { + "status": "unhealthy", + "database": db_path, + "error": str(e), + } + ), 503 + + +# EOF diff --git a/src/scitex/scholar/gui/static/css/scholar.css b/src/scitex/scholar/gui/static/css/scholar.css new file mode 100644 index 000000000..2e316d412 --- /dev/null +++ b/src/scitex/scholar/gui/static/css/scholar.css @@ -0,0 +1,727 @@ +/** + * Scholar GUI - Consolidated Styles + * Dark theme, ported from scitex-cloud. 
+ */ + +/* ── CSS Variables ─────────────────────────────────────────── */ +:root { + --bg-primary: #0d0d0d; + --bg-secondary: #151515; + --bg-tertiary: #1a1a1a; + --bg-monaco: #1e1e1e; + --border-default: #3a3a3a; + --text-primary: #d4e1e8; + --text-secondary: #a0b3c0; + --text-muted: #6c8ba0; + --text-inverse: #fff; + --accent: #059669; + --accent-hover: #047857; + --status-error: #ef4444; + --edge-color: #3a3a3a; +} + +/* ── Reset & Base ──────────────────────────────────────────── */ +* { + box-sizing: border-box; + margin: 0; + padding: 0; +} + +body { + font-family: + -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Oxygen, Ubuntu, + sans-serif; + background: var(--bg-primary); + color: var(--text-primary); + line-height: 1.5; +} + +a { + color: var(--accent); + text-decoration: none; +} +a:hover { + text-decoration: underline; +} + +/* ── Layout ────────────────────────────────────────────────── */ +.app-header { + display: flex; + align-items: center; + gap: 16px; + padding: 12px 24px; + background: var(--bg-secondary); + border-bottom: 1px solid var(--border-default); +} + +.app-header h1 { + font-size: 18px; + font-weight: 600; + color: var(--text-primary); +} + +.app-header .subtitle { + font-size: 12px; + color: var(--text-muted); +} + +.app-container { + display: flex; + height: calc(100vh - 49px); +} + +.app-sidebar { + width: 240px; + background: var(--bg-secondary); + border-right: 1px solid var(--border-default); + padding: 16px; + display: flex; + flex-direction: column; + gap: 12px; + flex-shrink: 0; +} + +.app-main { + flex: 1; + overflow-y: auto; + padding: 0; +} + +/* ── Tabs ──────────────────────────────────────────────────── */ +.tab-nav { + display: flex; + border-bottom: 1px solid var(--border-default); + background: var(--bg-secondary); + padding: 0 16px; +} + +.tab-btn { + padding: 12px 20px; + background: none; + border: none; + border-bottom: 2px solid transparent; + color: var(--text-muted); + font-size: 13px; + font-weight: 500; + cursor: pointer; + transition: + color 0.15s, + border-color 0.15s; +} + +.tab-btn:hover { + color: var(--text-primary); +} +.tab-btn.active { + color: var(--accent); + border-bottom-color: var(--accent); +} + +.tab-panel { + display: none; +} +.tab-panel.active { + display: block; +} + +/* ── Sidebar Sections ──────────────────────────────────────── */ +.sidebar-section { + background: var(--bg-secondary); + border: 1px solid var(--border-default); +} + +.sidebar-section__header { + display: flex; + justify-content: space-between; + align-items: center; + padding: 10px 12px; + cursor: pointer; +} + +.sidebar-section__header:hover { + background: var(--bg-tertiary); +} + +.sidebar-section__title { + font-size: 12px; + font-weight: 600; + color: var(--text-primary); +} + +.sidebar-section__content { + padding: 12px; + border-top: 1px solid var(--border-default); +} + +/* ── Status Indicators ─────────────────────────────────────── */ +.status-indicator { + display: flex; + align-items: center; + gap: 6px; + font-size: 12px; +} + +.status-healthy { + color: var(--accent); +} +.status-warning { + color: #f59e0b; +} +.status-error { + color: var(--status-error); +} +.status-detail { + display: block; + margin-top: 4px; + font-size: 11px; + color: var(--text-muted); +} + +/* ── Citation Graph Container ──────────────────────────────── */ +.citation-graph-container { + display: flex; + flex-direction: column; + gap: 16px; + padding: 16px; + flex: 1; + min-height: 0; +} + +/* ── Cards 
─────────────────────────────────────────────────── */ +.graph-card { + background: var(--bg-secondary); + border: 1px solid var(--border-default); +} + +.graph-card--full { + flex: 1; + display: flex; + flex-direction: column; + min-height: 400px; +} + +.graph-card__header { + display: flex; + justify-content: space-between; + align-items: center; + padding: 12px 16px; + border-bottom: 1px solid var(--border-default); + background: var(--bg-tertiary); +} + +.graph-card__title { + display: flex; + align-items: center; + gap: 8px; + font-size: 14px; + font-weight: 600; + color: var(--text-primary); +} + +.graph-card__body { + padding: 16px; + flex: 1; + min-height: 0; +} + +/* ── Forms ──────────────────────────────────────────────────── */ +.graph-description { + margin-bottom: 16px; + font-size: 13px; + color: var(--text-muted); + line-height: 1.5; +} + +.graph-form .form-row { + display: flex; + gap: 16px; + align-items: flex-end; +} + +.form-group--doi { + flex: 1; +} +.form-group--options { + width: 140px; +} + +.form-label { + display: block; + margin-bottom: 6px; + font-size: 12px; + font-weight: 500; + color: var(--text-secondary); + text-transform: uppercase; + letter-spacing: 0.5px; +} + +.input-wrapper { + display: flex; + gap: 8px; +} + +.form-input, +.form-select { + background: var(--bg-primary); + border: 1px solid var(--border-default); + color: var(--text-primary); + padding: 10px 12px; + font-size: 14px; + transition: + border-color 0.15s, + box-shadow 0.15s; +} + +.form-input { + flex: 1; +} + +.form-input:focus, +.form-select:focus { + outline: none; + border-color: var(--accent); + box-shadow: 0 0 0 2px rgba(5, 150, 105, 0.15); +} + +.form-input::placeholder { + color: var(--text-muted); +} + +/* ── Buttons ───────────────────────────────────────────────── */ +.btn-build { + display: inline-flex; + align-items: center; + gap: 8px; + padding: 10px 16px; + background: var(--accent); + border: none; + color: var(--text-inverse); + font-size: 14px; + font-weight: 500; + cursor: pointer; + transition: background 0.15s; + white-space: nowrap; +} + +.btn-build:hover { + background: var(--accent-hover); +} + +.graph-controls { + display: flex; + gap: 4px; +} + +.btn-control { + width: 32px; + height: 32px; + display: flex; + align-items: center; + justify-content: center; + background: var(--bg-primary); + border: 1px solid var(--border-default); + color: var(--text-muted); + cursor: pointer; + font-size: 14px; + transition: all 0.15s; +} + +.btn-control:hover { + border-color: var(--accent); + color: var(--accent); +} + +/* ── Graph Canvas ──────────────────────────────────────────── */ +.graph-canvas-wrapper { + position: relative; + background: var(--bg-monaco); + overflow: hidden; +} + +#graphCanvas { + width: 100%; + height: 100%; + min-height: 400px; +} + +.citation-graph-svg { + display: block; +} + +/* ── Graph Elements ────────────────────────────────────────── */ +.graph-edge { + stroke: var(--edge-color); + stroke-opacity: 0.6; + transition: + stroke 0.15s, + stroke-opacity 0.15s; +} + +.graph-edge.edge-coupling { + stroke: #3b82f6; +} +.graph-edge.edge-cocitation { + stroke: #8b5cf6; +} +.graph-edge.edge-direct, +.graph-edge.edge-cites { + stroke: #f59e0b; +} + +.graph-node { + cursor: pointer; +} +.graph-node:hover { + filter: brightness(1.3); +} +.graph-node.selected .node-circle { + stroke: #fff; + stroke-width: 3; +} + +.node-circle { + stroke: var(--bg-primary); + stroke-width: 2; + transition: fill 0.15s; +} +.node-seed .node-circle { + fill: var(--accent); +} 
+.node-related .node-circle { + fill: var(--text-muted); +} + +.node-year-ring { + stroke: var(--border-default); + stroke-opacity: 0.5; +} + +.node-label { + font-size: 10px; + font-weight: 600; + fill: var(--accent); + text-transform: uppercase; + letter-spacing: 1px; +} + +/* ── Tooltip ───────────────────────────────────────────────── */ +.graph-tooltip { + position: fixed; + z-index: 1000; + max-width: 300px; + padding: 12px; + background: var(--bg-tertiary); + border: 1px solid var(--border-default); + box-shadow: 0 4px 12px rgba(0, 0, 0, 0.4); + pointer-events: none; + transform: translate(-50%, -100%); + animation: tooltipIn 0.15s ease-out; +} + +@keyframes tooltipIn { + from { + opacity: 0; + transform: translate(-50%, -90%); + } + to { + opacity: 1; + transform: translate(-50%, -100%); + } +} + +.tooltip-title { + font-size: 13px; + font-weight: 600; + color: var(--text-primary); + margin-bottom: 6px; +} +.tooltip-authors { + font-size: 12px; + color: var(--text-muted); + margin-bottom: 8px; +} +.tooltip-meta { + display: flex; + gap: 12px; + font-size: 11px; +} +.tooltip-year { + color: var(--text-secondary); +} +.tooltip-score { + color: var(--accent); +} +.tooltip-hint { + margin-top: 8px; + padding-top: 8px; + border-top: 1px solid var(--border-default); + font-size: 11px; + color: var(--text-muted); + font-style: italic; +} + +/* ── Node Details Panel ────────────────────────────────────── */ +.node-details-panel { + position: absolute; + top: 16px; + right: 16px; + width: 280px; + background: var(--bg-secondary); + border: 1px solid var(--border-default); + box-shadow: 0 4px 16px rgba(0, 0, 0, 0.3); + animation: panelIn 0.2s ease-out; +} + +@keyframes panelIn { + from { + opacity: 0; + transform: translateX(20px); + } + to { + opacity: 1; + transform: translateX(0); + } +} + +.node-details-header { + display: flex; + justify-content: space-between; + align-items: center; + padding: 12px; + background: var(--bg-tertiary); + border-bottom: 1px solid var(--border-default); +} + +.node-details-header h6 { + margin: 0; + font-size: 12px; + font-weight: 600; + color: var(--text-primary); +} + +.btn-close-panel { + background: none; + border: none; + color: var(--text-muted); + cursor: pointer; + padding: 4px; + font-size: 18px; +} +.btn-close-panel:hover { + color: var(--text-primary); +} + +.node-details-content { + padding: 12px; +} +.detail-title { + font-size: 13px; + font-weight: 500; + color: var(--text-primary); + margin-bottom: 8px; + line-height: 1.4; +} +.detail-authors { + font-size: 12px; + color: var(--text-muted); + margin-bottom: 8px; +} +.detail-year, +.detail-score { + font-size: 12px; + color: var(--text-secondary); + margin-bottom: 4px; +} +.detail-score strong { + color: var(--accent); +} +.detail-doi { + margin-top: 12px; + padding-top: 12px; + border-top: 1px solid var(--border-default); +} +.detail-doi a { + font-size: 12px; + color: var(--accent); +} + +/* ── Related Papers ────────────────────────────────────────── */ +.related-papers-section .graph-card__body { + max-height: 300px; + overflow-y: auto; + padding: 0; +} + +.related-paper-item { + display: flex; + align-items: center; + gap: 12px; + padding: 12px 16px; + border-bottom: 1px solid var(--border-default); + cursor: pointer; + transition: background 0.15s; +} + +.related-paper-item:last-child { + border-bottom: none; +} +.related-paper-item:hover { + background: var(--bg-tertiary); +} + +.paper-rank { + flex-shrink: 0; + width: 24px; + height: 24px; + display: flex; + align-items: center; + 
justify-content: center; + background: var(--bg-tertiary); + font-size: 11px; + font-weight: 600; + color: var(--text-muted); +} + +.paper-info { + flex: 1; + min-width: 0; +} +.paper-title { + font-size: 13px; + font-weight: 500; + color: var(--text-primary); + margin-bottom: 4px; + overflow: hidden; + text-overflow: ellipsis; + white-space: nowrap; +} +.paper-meta { + display: flex; + gap: 8px; + font-size: 11px; + color: var(--text-muted); +} + +.paper-score { + flex-shrink: 0; + display: flex; + flex-direction: column; + align-items: flex-end; + gap: 4px; +} +.score-bar { + width: 60px; + height: 4px; + background: var(--bg-tertiary); + overflow: hidden; +} +.score-fill { + height: 100%; + background: var(--accent); + transition: width 0.3s ease-out; +} +.score-value { + font-size: 11px; + font-weight: 600; + color: var(--accent); +} + +/* ── Loading & Error States ────────────────────────────────── */ +.graph-loading { + display: flex; + flex-direction: column; + align-items: center; + justify-content: center; + padding: 48px; + background: var(--bg-secondary); + border: 1px solid var(--border-default); +} + +.loading-spinner { + position: relative; + width: 60px; + height: 60px; +} + +.spinner-ring { + position: absolute; + inset: 0; + border: 2px solid transparent; + border-top-color: var(--accent); + border-radius: 50%; + animation: spin 1.2s linear infinite; +} + +.spinner-ring:nth-child(2) { + inset: 6px; + animation-delay: 0.15s; + border-top-color: rgba(5, 150, 105, 0.6); +} +.spinner-ring:nth-child(3) { + inset: 12px; + animation-delay: 0.3s; + border-top-color: rgba(5, 150, 105, 0.3); +} + +@keyframes spin { + to { + transform: rotate(360deg); + } +} + +.loading-text { + margin-top: 16px; + font-size: 13px; + color: var(--text-muted); +} + +.graph-error { + display: flex; + flex-direction: column; + align-items: center; + padding: 48px; + text-align: center; + background: var(--bg-secondary); + border: 1px solid var(--border-default); +} + +.graph-error p { + font-size: 14px; + color: var(--text-primary); + margin-bottom: 16px; +} + +.empty-message, +.error-message { + padding: 24px; + text-align: center; + font-size: 13px; + color: var(--text-muted); +} +.error-message { + color: var(--status-error); +} + +/* ── Placeholder Tabs ──────────────────────────────────────── */ +.tab-placeholder { + display: flex; + align-items: center; + justify-content: center; + padding: 80px 40px; + text-align: center; + color: var(--text-muted); +} + +.tab-placeholder h3 { + font-size: 18px; + color: var(--text-secondary); + margin-bottom: 8px; +} +.tab-placeholder p { + font-size: 14px; +} + +/* ── Hidden ────────────────────────────────────────────────── */ +.hidden { + display: none !important; +} diff --git a/src/scitex/scholar/gui/static/js/app.js b/src/scitex/scholar/gui/static/js/app.js new file mode 100644 index 000000000..7334a852d --- /dev/null +++ b/src/scitex/scholar/gui/static/js/app.js @@ -0,0 +1,22 @@ +/** + * Scholar GUI - Tab Manager + * + * Manages tab switching for the Scholar SPA. 
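+ *
+ * Convention: each button carries data-tab="NAME" and activates the panel
+ * with id="tab-NAME" (see templates/scholar.html).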
+ */ +document.addEventListener("DOMContentLoaded", () => { + const tabs = document.querySelectorAll(".tab-btn"); + const panels = document.querySelectorAll(".tab-panel"); + + tabs.forEach((tab) => { + tab.addEventListener("click", () => { + const target = tab.dataset.tab; + + tabs.forEach((t) => t.classList.remove("active")); + panels.forEach((p) => p.classList.remove("active")); + + tab.classList.add("active"); + const panel = document.getElementById("tab-" + target); + if (panel) panel.classList.add("active"); + }); + }); +}); diff --git a/src/scitex/scholar/gui/static/js/graph/ForceSimulation.js b/src/scitex/scholar/gui/static/js/graph/ForceSimulation.js new file mode 100644 index 000000000..e463e6cb8 --- /dev/null +++ b/src/scitex/scholar/gui/static/js/graph/ForceSimulation.js @@ -0,0 +1,137 @@ +/** + * ForceSimulation - Force-directed graph physics engine + * + * Repulsion, attraction, centering, velocity damping. + * Ported from scitex-cloud (standalone, no ES6 modules). + */ +class ForceSimulation { + constructor(nodes, edges, width, height) { + this.nodes = nodes; + this.edges = edges; + this.width = width; + this.height = height; + this.alpha = 1; + this.alphaDecay = 0.02; + this.alphaMin = 0.001; + this.tickCallback = null; + this.animationId = null; + } + + onTick(callback) { + this.tickCallback = callback; + } + + start() { + this.alpha = 1; + this.tick(); + } + + stop() { + if (this.animationId) { + cancelAnimationFrame(this.animationId); + this.animationId = null; + } + } + + reheat() { + this.alpha = Math.max(this.alpha, 0.3); + if (!this.animationId) this.tick(); + } + + tick() { + if (this.alpha < this.alphaMin) { + this.animationId = null; + return; + } + this.applyForces(); + this.alpha *= 1 - this.alphaDecay; + if (this.tickCallback) this.tickCallback(); + this.animationId = requestAnimationFrame(() => this.tick()); + } + + applyForces() { + const centerX = this.width / 2; + const centerY = this.height / 2; + + this.nodes.forEach((node) => { + if (node.fx != null) node.x = node.fx; + if (node.fy != null) node.y = node.fy; + }); + + this.applyRepulsion(); + this.applyAttraction(); + this.applyCentering(centerX, centerY); + this.applyVelocities(); + } + + applyRepulsion() { + for (let i = 0; i < this.nodes.length; i++) { + for (let j = i + 1; j < this.nodes.length; j++) { + const a = this.nodes[i]; + const b = this.nodes[j]; + const dx = (b.x || 0) - (a.x || 0); + const dy = (b.y || 0) - (a.y || 0); + const dist = Math.sqrt(dx * dx + dy * dy) || 1; + const force = (500 / (dist * dist)) * this.alpha; + const fx = (dx / dist) * force; + const fy = (dy / dist) * force; + + if (a.fx == null) { + a.vx = (a.vx || 0) - fx; + a.vy = (a.vy || 0) - fy; + } + if (b.fx == null) { + b.vx = (b.vx || 0) + fx; + b.vy = (b.vy || 0) + fy; + } + } + } + } + + applyAttraction() { + this.edges.forEach((edge) => { + const source = edge.source; + const target = edge.target; + const dx = (target.x || 0) - (source.x || 0); + const dy = (target.y || 0) - (source.y || 0); + const dist = Math.sqrt(dx * dx + dy * dy) || 1; + const force = (dist - 100) * 0.05 * this.alpha; + const fx = (dx / dist) * force; + const fy = (dy / dist) * force; + + if (source.fx == null) { + source.vx = (source.vx || 0) + fx; + source.vy = (source.vy || 0) + fy; + } + if (target.fx == null) { + target.vx = (target.vx || 0) - fx; + target.vy = (target.vy || 0) - fy; + } + }); + } + + applyCentering(centerX, centerY) { + this.nodes.forEach((node) => { + if (node.fx == null) { + node.vx = + (node.vx || 0) + (centerX - 
(node.x || 0)) * 0.01 * this.alpha; + node.vy = + (node.vy || 0) + (centerY - (node.y || 0)) * 0.01 * this.alpha; + } + }); + } + + applyVelocities() { + const padding = 50; + this.nodes.forEach((node) => { + if (node.fx == null) { + node.vx = (node.vx || 0) * 0.6; + node.vy = (node.vy || 0) * 0.6; + node.x = (node.x || 0) + (node.vx || 0); + node.y = (node.y || 0) + (node.vy || 0); + node.x = Math.max(padding, Math.min(this.width - padding, node.x)); + node.y = Math.max(padding, Math.min(this.height - padding, node.y)); + } + }); + } +} diff --git a/src/scitex/scholar/gui/static/js/graph/GraphRenderer.js b/src/scitex/scholar/gui/static/js/graph/GraphRenderer.js new file mode 100644 index 000000000..9b4ab4693 --- /dev/null +++ b/src/scitex/scholar/gui/static/js/graph/GraphRenderer.js @@ -0,0 +1,233 @@ +/** + * GraphRenderer - SVG graph rendering for citation networks + * + * Handles SVG creation, node/edge rendering, position updates. + * Ported from scitex-cloud (standalone, no ES6 modules). + */ +class GraphRenderer { + constructor(callbacks) { + this.svg = null; + this.simulation = null; + this.callbacks = callbacks; + } + + getSvg() { + return this.svg; + } + + getSimulation() { + return this.simulation; + } + + render(container, nodes, edges) { + const width = container.clientWidth || 800; + const height = container.clientHeight || 500; + container.innerHTML = ""; + + const svg = document.createElementNS("http://www.w3.org/2000/svg", "svg"); + svg.setAttribute("width", "100%"); + svg.setAttribute("height", "100%"); + svg.setAttribute("viewBox", `0 0 ${width} ${height}`); + svg.id = "citationGraphSvg"; + svg.classList.add("citation-graph-svg"); + this.svg = svg; + + const mainGroup = document.createElementNS( + "http://www.w3.org/2000/svg", + "g", + ); + mainGroup.id = "graphMainGroup"; + + this.createDefs(svg); + + const edgeGroup = document.createElementNS( + "http://www.w3.org/2000/svg", + "g", + ); + edgeGroup.setAttribute("class", "graph-edges"); + const nodeGroup = document.createElementNS( + "http://www.w3.org/2000/svg", + "g", + ); + nodeGroup.setAttribute("class", "graph-nodes"); + + mainGroup.appendChild(edgeGroup); + mainGroup.appendChild(nodeGroup); + svg.appendChild(mainGroup); + container.appendChild(svg); + + this.initializePositions(nodes, width, height); + + const nodeMap = new Map(nodes.map((n) => [n.id, n])); + const resolvedEdges = this.resolveEdges(edges, nodeMap); + + this.simulation = new ForceSimulation(nodes, resolvedEdges, width, height); + this.simulation.onTick(() => + this.updatePositions(edgeGroup, nodeGroup, nodes, resolvedEdges), + ); + this.simulation.start(); + + this.createElements(edgeGroup, nodeGroup, nodes, resolvedEdges); + } + + createDefs(svg) { + const defs = document.createElementNS("http://www.w3.org/2000/svg", "defs"); + const marker = document.createElementNS( + "http://www.w3.org/2000/svg", + "marker", + ); + marker.setAttribute("id", "arrowhead"); + marker.setAttribute("viewBox", "0 -5 10 10"); + marker.setAttribute("refX", "20"); + marker.setAttribute("refY", "0"); + marker.setAttribute("markerWidth", "6"); + marker.setAttribute("markerHeight", "6"); + marker.setAttribute("orient", "auto"); + + const arrow = document.createElementNS( + "http://www.w3.org/2000/svg", + "path", + ); + arrow.setAttribute("d", "M0,-5L10,0L0,5"); + arrow.setAttribute("fill", "#3a3a3a"); + marker.appendChild(arrow); + defs.appendChild(marker); + svg.appendChild(defs); + } + + initializePositions(nodes, width, height) { + const cx = width / 2; + const cy = 
height / 2;
+    nodes.forEach((node, i) => {
+      if (node.is_seed) {
+        node.x = cx;
+        node.y = cy;
+        node.fx = cx;
+        node.fy = cy;
+      } else {
+        const angle = (2 * Math.PI * i) / nodes.length;
+        const radius = Math.min(width, height) * 0.3;
+        node.x = cx + radius * Math.cos(angle);
+        node.y = cy + radius * Math.sin(angle);
+      }
+      node.vx = 0;
+      node.vy = 0;
+    });
+  }
+
+  resolveEdges(edges, nodeMap) {
+    return edges
+      .map((e) => ({
+        ...e,
+        source: typeof e.source === "string" ? nodeMap.get(e.source) : e.source,
+        target: typeof e.target === "string" ? nodeMap.get(e.target) : e.target,
+      }))
+      .filter((e) => e.source && e.target);
+  }
+
+  createElements(edgeGroup, nodeGroup, nodes, edges) {
+    edges.forEach((edge) => {
+      const line = document.createElementNS(
+        "http://www.w3.org/2000/svg",
+        "line",
+      );
+      line.setAttribute("class", `graph-edge edge-${edge.type || "cites"}`);
+      line.setAttribute("data-source", edge.source.id);
+      line.setAttribute("data-target", edge.target.id);
+      line.setAttribute(
+        "stroke-width",
+        String(Math.max(1, Math.min((edge.weight || 10) / 20, 3))),
+      );
+      edgeGroup.appendChild(line);
+    });
+
+    nodes.forEach((node) => {
+      const group = document.createElementNS("http://www.w3.org/2000/svg", "g");
+      group.setAttribute(
+        "class",
+        `graph-node ${node.is_seed ? "node-seed" : "node-related"}`,
+      );
+      group.setAttribute("data-id", node.id);
+
+      const radius = node.is_seed
+        ? 16
+        : Math.max(8, Math.min(12, (node.similarity_score || 10) / 5));
+
+      const yearRing = document.createElementNS(
+        "http://www.w3.org/2000/svg",
+        "circle",
+      );
+      yearRing.setAttribute("r", String(radius + 3));
+      yearRing.setAttribute("class", "node-year-ring");
+      yearRing.setAttribute("fill", "none");
+      yearRing.setAttribute("stroke-width", "2");
+
+      const circle = document.createElementNS(
+        "http://www.w3.org/2000/svg",
+        "circle",
+      );
+      circle.setAttribute("r", String(radius));
+      circle.setAttribute("class", "node-circle");
+
+      group.appendChild(yearRing);
+      group.appendChild(circle);
+
+      if (node.is_seed) {
+        const label = document.createElementNS(
+          "http://www.w3.org/2000/svg",
+          "text",
+        );
+        label.setAttribute("class", "node-label");
+        label.setAttribute("dy", String(radius + 16));
+        label.setAttribute("text-anchor", "middle");
+        label.textContent = "SEED";
+        group.appendChild(label);
+      }
+
+      group.addEventListener("mouseenter", () =>
+        this.callbacks.onNodeHover(node, group),
+      );
+      group.addEventListener("mouseleave", () => this.callbacks.onNodeLeave());
+      group.addEventListener("click", () => this.callbacks.onNodeClick(node));
+      group.addEventListener("mousedown", (e) =>
+        this.callbacks.onNodeDragStart(e, node),
+      );
+
+      nodeGroup.appendChild(group);
+    });
+  }
+
+  updatePositions(edgeGroup, nodeGroup, nodes, edges) {
+    const lines = edgeGroup.querySelectorAll("line");
+    lines.forEach((line, i) => {
+      const edge = edges[i];
+      if (!edge) return;
+      line.setAttribute("x1", String(edge.source.x || 0));
+      line.setAttribute("y1", String(edge.source.y || 0));
+      line.setAttribute("x2", String(edge.target.x || 0));
+      line.setAttribute("y2", String(edge.target.y || 0));
+    });
+
+    const nodeEls = nodeGroup.querySelectorAll(".graph-node");
+    nodeEls.forEach((el) => {
+      const nodeId = el.getAttribute("data-id");
+      const node = nodes.find((n) => n.id === nodeId);
+      if (node) {
+        el.setAttribute(
+          "transform",
+          `translate(${node.x || 0}, ${node.y || 0})`,
+        );
+      }
+    });
+  }
+
+  applyTransform(transform) {
+    const g = document.getElementById("graphMainGroup");
+    if (g) {
+      g.setAttribute(
+        "transform",
+        `translate(${transform.x}, ${transform.y}) scale(${transform.k})`,
+      );
+    }
+  }
+}
diff --git a/src/scitex/scholar/gui/static/js/graph/citation-graph.js b/src/scitex/scholar/gui/static/js/graph/citation-graph.js
new file mode 100644
index 000000000..995660319
--- /dev/null
+++ b/src/scitex/scholar/gui/static/js/graph/citation-graph.js
@@ -0,0 +1,407 @@
+/**
+ * CitationGraphManager - Main controller for citation graph visualization
+ *
+ * Handles form submission, graph rendering, zoom/pan, tooltips, node selection.
+ * Ported from scitex-cloud (standalone, no ES6 modules).
+ *
+ * Depends on: ForceSimulation.js, GraphRenderer.js (loaded before this).
+ */
+class CitationGraphManager {
+  constructor() {
+    this.currentData = null;
+    this.transform = { x: 0, y: 0, k: 1 };
+    this.isDragging = false;
+    this.selectedNode = null;
+
+    this.renderer = new GraphRenderer({
+      onNodeHover: (node, el) => this.showNodeTooltip(node, el),
+      onNodeLeave: () => this.hideNodeTooltip(),
+      onNodeClick: (node) => this.selectNode(node),
+      onNodeDragStart: (e, node) => this.startNodeDrag(e, node),
+    });
+
+    this.init();
+  }
+
+  init() {
+    this.bindEvents();
+    this.checkServiceHealth();
+  }
+
+  bindEvents() {
+    const form = document.getElementById("graphForm");
+    if (form) form.addEventListener("submit", (e) => this.handleSubmit(e));
+
+    const resetBtn = document.getElementById("resetZoomBtn");
+    if (resetBtn) resetBtn.addEventListener("click", () => this.resetView());
+
+    const downloadBtn = document.getElementById("downloadSvgBtn");
+    if (downloadBtn)
+      downloadBtn.addEventListener("click", () => this.downloadSvg());
+
+    const fitBtn = document.getElementById("fitViewBtn");
+    if (fitBtn) fitBtn.addEventListener("click", () => this.fitToView());
+  }
+
+  async checkServiceHealth() {
+    const el = document.getElementById("serviceStatus");
+    if (!el) return;
+
+    try {
+      const resp = await fetch("/api/graph/health");
+      const data = await resp.json();
+      if (data.status === "healthy") {
+        el.innerHTML =
+          '<span class="status-healthy">● Service available</span>';
+      } else {
+        el.innerHTML =
+          '<span class="status-warning">● Service limited</span>' +
+          '<span class="status-detail">' +
+          (data.error || "Unknown") +
+          "</span>";
+      }
+    } catch {
+      el.innerHTML =
+        '<span class="status-error">● Service unavailable</span>';
+    }
+  }
+
+  async handleSubmit(e) {
+    e.preventDefault();
+    const doiInput = document.getElementById("doiInput");
+    const topNSelect = document.getElementById("topN");
+    if (!doiInput || !doiInput.value.trim()) {
+      this.showError("Please enter a DOI");
+      return;
+    }
+
+    const doi = doiInput.value.trim();
+    const topN = parseInt(topNSelect ? topNSelect.value : "20", 10);
+
+    this.showLoading(true);
+    this.hideError();
+
+    try {
+      const url = `/api/graph/network?doi=${encodeURIComponent(doi)}&top_n=${topN}`;
+      const resp = await fetch(url);
+      if (!resp.ok) {
+        const err = await resp.json();
+        throw new Error(err.error || "Failed to build network");
+      }
+
+      const data = await resp.json();
+      this.currentData = data;
+      this.renderGraph(data);
+      this.fetchRelatedPapers(doi, topN);
+    } catch (err) {
+      this.showError(err.message || "An error occurred");
+    } finally {
+      this.showLoading(false);
+    }
+  }
+
+  renderGraph(data) {
+    const container = document.getElementById("graphVisualization");
+    const canvas = document.getElementById("graphCanvas");
+    if (!container || !canvas) return;
+
+    container.classList.remove("hidden");
+
+    const titleEl = document.getElementById("graphTitle");
+    if (titleEl) {
+      const seed = data.nodes.find((n) => n.is_seed);
+      titleEl.textContent = seed
+        ? "Network: " + seed.title.substring(0, 60) + "..."
+ : "Citation Network"; + } + + const statsEl = document.getElementById("graphStats"); + if (statsEl) { + statsEl.textContent = `${data.nodes.length} nodes, ${data.edges.length} edges`; + } + + this.renderer.render(canvas, data.nodes, data.edges); + this.setupZoomPan(canvas); + } + + setupZoomPan(container) { + const svg = this.renderer.getSvg(); + if (!svg) return; + + let isPanning = false; + let startX = 0; + let startY = 0; + + svg.addEventListener("wheel", (e) => { + e.preventDefault(); + const factor = e.deltaY > 0 ? 0.9 : 1.1; + const rect = svg.getBoundingClientRect(); + const mx = e.clientX - rect.left; + const my = e.clientY - rect.top; + + const newK = Math.max(0.1, Math.min(5, this.transform.k * factor)); + this.transform.x = + mx - (mx - this.transform.x) * (newK / this.transform.k); + this.transform.y = + my - (my - this.transform.y) * (newK / this.transform.k); + this.transform.k = newK; + this.renderer.applyTransform(this.transform); + }); + + svg.addEventListener("mousedown", (e) => { + if (e.target === svg || e.target.closest(".graph-edges")) { + isPanning = true; + startX = e.clientX - this.transform.x; + startY = e.clientY - this.transform.y; + svg.style.cursor = "grabbing"; + } + }); + + svg.addEventListener("mousemove", (e) => { + if (isPanning && !this.isDragging) { + this.transform.x = e.clientX - startX; + this.transform.y = e.clientY - startY; + this.renderer.applyTransform(this.transform); + } + }); + + svg.addEventListener("mouseup", () => { + isPanning = false; + svg.style.cursor = "grab"; + }); + svg.addEventListener("mouseleave", () => { + isPanning = false; + svg.style.cursor = "grab"; + }); + svg.style.cursor = "grab"; + } + + startNodeDrag(e, node) { + e.stopPropagation(); + this.isDragging = true; + const svg = this.renderer.getSvg(); + const rect = svg.getBoundingClientRect(); + + const onMove = (ev) => { + const x = (ev.clientX - rect.left - this.transform.x) / this.transform.k; + const y = (ev.clientY - rect.top - this.transform.y) / this.transform.k; + node.fx = x; + node.fy = y; + node.x = x; + node.y = y; + const sim = this.renderer.getSimulation(); + if (sim) sim.reheat(); + }; + + const onUp = () => { + this.isDragging = false; + if (!node.is_seed) { + node.fx = null; + node.fy = null; + } + document.removeEventListener("mousemove", onMove); + document.removeEventListener("mouseup", onUp); + }; + + document.addEventListener("mousemove", onMove); + document.addEventListener("mouseup", onUp); + } + + showNodeTooltip(node, element) { + const old = document.getElementById("graphTooltip"); + if (old) old.remove(); + + const tip = document.createElement("div"); + tip.id = "graphTooltip"; + tip.className = "graph-tooltip"; + tip.innerHTML = ` +
      <div class="tooltip-title">${this.esc(node.title)}</div>
+      <div class="tooltip-authors">${(node.authors || []).slice(0, 3).join(", ")}${(node.authors || []).length > 3 ? "..." : ""}</div>
+      <div class="tooltip-meta">
+        <span class="tooltip-year">${node.year || "?"}</span>
+        ${node.similarity_score ? '<span class="tooltip-score">Score: ' + node.similarity_score.toFixed(1) + "</span>" : ""}
+      </div>
+      <div class="tooltip-hint">Click to view details</div>
+ `; + document.body.appendChild(tip); + + const rect = element.getBoundingClientRect(); + tip.style.left = rect.left + rect.width / 2 + "px"; + tip.style.top = rect.top - 10 + "px"; + } + + hideNodeTooltip() { + const el = document.getElementById("graphTooltip"); + if (el) el.remove(); + } + + selectNode(node) { + this.selectedNode = node; + document + .querySelectorAll(".graph-node") + .forEach((el) => el.classList.remove("selected")); + const el = document.querySelector(`[data-id="${node.id}"]`); + if (el) el.classList.add("selected"); + this.showNodeDetails(node); + } + + showNodeDetails(node) { + const panel = document.getElementById("nodeDetailsPanel"); + if (!panel) return; + panel.classList.remove("hidden"); + panel.innerHTML = ` +
      <div class="node-details-header">
+        <h6>${node.is_seed ? "★ Seed Paper" : "Related Paper"}</h6>
+        <button class="btn-close-panel" type="button"
+                onclick="document.getElementById('nodeDetailsPanel').classList.add('hidden')">&times;</button>
+      </div>
+      <div class="node-details-content">
+        <div class="detail-title">${this.esc(node.title)}</div>
+        <div class="detail-authors">${(node.authors || []).join(", ")}</div>
+        <div class="detail-year">Published: ${node.year || "?"}</div>
+        ${node.similarity_score ? '<div class="detail-score">Similarity: <strong>' + node.similarity_score.toFixed(2) + "</strong></div>" : ""}
+        <div class="detail-doi"><a href="https://doi.org/${node.id}" target="_blank">View on DOI.org ↗</a></div>
+      </div>
+ `; + } + + async fetchRelatedPapers(doi, limit) { + const container = document.getElementById("relatedPapersList"); + const content = document.getElementById("relatedPapersContent"); + if (!container || !content) return; + + try { + const url = `/api/graph/related?doi=${encodeURIComponent(doi)}&limit=${limit}`; + const resp = await fetch(url); + if (!resp.ok) throw new Error("Failed"); + + const data = await resp.json(); + const papers = data.related || []; + + content.innerHTML = + papers.length === 0 + ? '
<div class="empty-message">No related papers found</div>'
+          : papers
+              .map(
+                (p, i) => `
+        <div class="related-paper-item" data-doi="${p.id}">
+          <div class="paper-rank">${i + 1}</div>
+          <div class="paper-info">
+            <div class="paper-title">${this.esc(p.title)}</div>
+            <div class="paper-meta"><span>${p.year || "?"}</span></div>
+          </div>
+          <div class="paper-score">
+            <div class="score-bar"><div class="score-fill" style="width: ${Math.min(100, p.similarity_score || 0)}%"></div></div>
+            <div class="score-value">${(p.similarity_score || 0).toFixed(1)}</div>
+          </div>
+        </div>
+      `,
+              )
+              .join("");
+
+      content.querySelectorAll(".related-paper-item").forEach((item) => {
+        item.addEventListener("click", () => {
+          const d = item.getAttribute("data-doi");
+          if (d && this.currentData) {
+            const n = this.currentData.nodes.find((nd) => nd.id === d);
+            if (n) this.selectNode(n);
+          }
+        });
+      });
+
+      container.classList.remove("hidden");
+    } catch {
+      content.innerHTML =
+        '<div class="error-message">Failed to load related papers</div>
';
+      container.classList.remove("hidden");
+    }
+  }
+
+  showLoading(show) {
+    const loading = document.getElementById("graphLoading");
+    const viz = document.getElementById("graphVisualization");
+    const related = document.getElementById("relatedPapersList");
+    if (show) {
+      if (loading) loading.classList.remove("hidden");
+      if (viz) viz.classList.add("hidden");
+      if (related) related.classList.add("hidden");
+    } else {
+      if (loading) loading.classList.add("hidden");
+    }
+  }
+
+  showError(msg) {
+    const el = document.getElementById("graphError");
+    const msgEl = document.getElementById("graphErrorMessage");
+    if (el && msgEl) {
+      msgEl.textContent = msg;
+      el.classList.remove("hidden");
+    }
+  }
+
+  hideError() {
+    const el = document.getElementById("graphError");
+    if (el) el.classList.add("hidden");
+  }
+
+  resetView() {
+    this.transform = { x: 0, y: 0, k: 1 };
+    this.renderer.applyTransform(this.transform);
+  }
+
+  fitToView() {
+    if (!this.currentData) return;
+    const svg = this.renderer.getSvg();
+    if (!svg) return;
+
+    const nodes = this.currentData.nodes;
+    if (nodes.length === 0) return;
+
+    const minX = Math.min(...nodes.map((n) => n.x || 0));
+    const maxX = Math.max(...nodes.map((n) => n.x || 0));
+    const minY = Math.min(...nodes.map((n) => n.y || 0));
+    const maxY = Math.max(...nodes.map((n) => n.y || 0));
+
+    const pad = 50;
+    const gw = maxX - minX + pad * 2;
+    const gh = maxY - minY + pad * 2;
+    const svgRect = svg.getBoundingClientRect();
+    const scale = Math.min(svgRect.width / gw, svgRect.height / gh, 2);
+
+    this.transform = {
+      x: svgRect.width / 2 - ((minX + maxX) / 2) * scale,
+      y: svgRect.height / 2 - ((minY + maxY) / 2) * scale,
+      k: scale,
+    };
+    this.renderer.applyTransform(this.transform);
+  }
+
+  downloadSvg() {
+    const svg = document.getElementById("citationGraphSvg");
+    if (!svg) return;
+
+    const data = new XMLSerializer().serializeToString(svg);
+    const blob = new Blob([data], { type: "image/svg+xml" });
+    const url = URL.createObjectURL(blob);
+    const a = document.createElement("a");
+    a.href = url;
+    a.download = "citation-graph.svg";
+    document.body.appendChild(a);
+    a.click();
+    document.body.removeChild(a);
+    URL.revokeObjectURL(url);
+  }
+
+  esc(text) {
+    const d = document.createElement("div");
+    d.textContent = text || "";
+    return d.innerHTML;
+  }
+}
+
+// Initialize on DOM ready
+document.addEventListener("DOMContentLoaded", () => {
+  new CitationGraphManager();
+});
diff --git a/src/scitex/scholar/gui/templates/scholar.html b/src/scitex/scholar/gui/templates/scholar.html
new file mode 100644
index 000000000..3a74fd4af
--- /dev/null
+++ b/src/scitex/scholar/gui/templates/scholar.html
@@ -0,0 +1,212 @@
+<!DOCTYPE html>
+<html lang="en">
+  <head>
+    <meta charset="utf-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1" />
+    <title>SciTeX Scholar</title>
+    <link
+      rel="stylesheet"
+      href="{{ url_for('static', filename='css/scholar.css') }}"
+    />
+  </head>
+  <body>
+    <header class="app-header">
+      <h1>SciTeX Scholar</h1>
+      <span class="subtitle">Scientific Literature Management</span>
+    </header>
+
+    <nav class="tab-nav">
+      <button class="tab-btn active" data-tab="graph">Citation Graph</button>
+      <button class="tab-btn" data-tab="library">Library</button>
+      <button class="tab-btn" data-tab="enrichment">Enrichment</button>
+    </nav>
+
+    <!-- Citation Graph tab -->
+    <div class="tab-panel active" id="tab-graph">
+      <div class="citation-graph-container">
+        <div class="graph-card">
+          <div class="graph-card__header">
+            <span class="graph-card__title">Build Citation Network</span>
+            <span class="status-indicator" id="serviceStatus"></span>
+          </div>
+          <div class="graph-card__body">
+            <p class="graph-description">
+              Enter a DOI to build an interactive citation network. The graph shows
+              related papers based on bibliographic coupling, co-citation, and direct citations.
+            </p>
+            <form class="graph-form" id="graphForm">
+              <div class="form-row">
+                <div class="form-group form-group--doi">
+                  <label class="form-label" for="doiInput">DOI</label>
+                  <div class="input-wrapper">
+                    <input
+                      class="form-input"
+                      id="doiInput"
+                      type="text"
+                      placeholder="e.g., 10.1038/s41586-020-2008-3"
+                    />
+                    <button class="btn-build" type="submit">Build</button>
+                  </div>
+                </div>
+                <div class="form-group form-group--options">
+                  <label class="form-label" for="topN">Top N</label>
+                  <select class="form-select" id="topN">
+                    <option value="10">10</option>
+                    <option value="20" selected>20</option>
+                    <option value="50">50</option>
+                  </select>
+                </div>
+              </div>
+            </form>
+          </div>
+        </div>
+
+        <div class="graph-loading hidden" id="graphLoading">
+          <div class="loading-spinner">
+            <div class="spinner-ring"></div>
+            <div class="spinner-ring"></div>
+            <div class="spinner-ring"></div>
+          </div>
+          <div class="loading-text">Building citation network...</div>
+        </div>
+
+        <div class="graph-error hidden" id="graphError">
+          <p id="graphErrorMessage"></p>
+        </div>
+
+        <div class="graph-card graph-card--full hidden" id="graphVisualization">
+          <div class="graph-card__header">
+            <span class="graph-card__title" id="graphTitle">Citation Network</span>
+            <span class="status-detail" id="graphStats"></span>
+            <div class="graph-controls">
+              <button class="btn-control" id="fitViewBtn" title="Fit to view">⤢</button>
+              <button class="btn-control" id="resetZoomBtn" title="Reset zoom">⟲</button>
+              <button class="btn-control" id="downloadSvgBtn" title="Download SVG">⇩</button>
+            </div>
+          </div>
+          <div class="graph-card__body graph-canvas-wrapper">
+            <div id="graphCanvas"></div>
+            <div class="node-details-panel hidden" id="nodeDetailsPanel"></div>
+          </div>
+        </div>
+
+        <div class="graph-card related-papers-section hidden" id="relatedPapersList">
+          <div class="graph-card__header">
+            <span class="graph-card__title">Related Papers</span>
+          </div>
+          <div class="graph-card__body" id="relatedPapersContent"></div>
+        </div>
+      </div>
+    </div>
+
+    <!-- Library tab (placeholder) -->
+    <div class="tab-panel" id="tab-library">
+      <div class="tab-placeholder">
+        <div>
+          <h3>Library</h3>
+          <p>Zotero-style paper management. Coming in Phase 2.</p>
+        </div>
+      </div>
+    </div>
+
+    <!-- Enrichment tab (placeholder) -->
+    <div class="tab-panel" id="tab-enrichment">
+      <div class="tab-placeholder">
+        <div>
+          <h3>Metadata Enrichment</h3>
+          <p>Enrich BibTeX with abstracts, citations, and impact factors. Coming in Phase 4.</p>
+        </div>
+      </div>
+    </div>
+
+    <script src="{{ url_for('static', filename='js/app.js') }}"></script>
+    <script src="{{ url_for('static', filename='js/graph/ForceSimulation.js') }}"></script>
+    <script src="{{ url_for('static', filename='js/graph/GraphRenderer.js') }}"></script>
+    <script src="{{ url_for('static', filename='js/graph/citation-graph.js') }}"></script>
+  </body>
+</html>
+ + + + + + + diff --git a/src/scitex/scholar/integration/zotero/__init__.py b/src/scitex/scholar/integration/zotero/__init__.py index 820f3c837..8cde4b794 100755 --- a/src/scitex/scholar/integration/zotero/__init__.py +++ b/src/scitex/scholar/integration/zotero/__init__.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- """ Zotero integration for SciTeX Scholar module. @@ -17,11 +16,14 @@ from .exporter import ZoteroExporter from .importer import ZoteroImporter from .linker import ZoteroLinker +from .local_reader import ZoteroLocalReader, export_for_zotero from .mapper import ZoteroMapper __all__ = [ "ZoteroImporter", "ZoteroExporter", "ZoteroLinker", + "ZoteroLocalReader", "ZoteroMapper", + "export_for_zotero", ] diff --git a/src/scitex/scholar/integration/zotero/local_reader.py b/src/scitex/scholar/integration/zotero/local_reader.py new file mode 100755 index 000000000..f1b58e1ad --- /dev/null +++ b/src/scitex/scholar/integration/zotero/local_reader.py @@ -0,0 +1,397 @@ +#!/usr/bin/env python3 +""" +Zotero local SQLite reader — no API key required. + +Reads directly from Zotero's local database file (zotero.sqlite). +Auto-detects Linux and Windows (WSL) Zotero installations. + +Usage: + from scitex.scholar.integration.zotero import ZoteroLocalReader, export_for_zotero + + reader = ZoteroLocalReader() # auto-detect + papers = reader.read_all() # all items + papers = reader.read_by_tags(["EEG"]) # filter by tag + export_for_zotero(papers, "out.bib") # export for Zotero > File > Import +""" + +from __future__ import annotations + +import sqlite3 +from pathlib import Path +from typing import Dict, List, Optional + +from scitex.scholar.core.Papers import Papers + +from .mapper import ZoteroMapper + +# ── Known Zotero DB paths ───────────────────────────────────────────────────── + +_LINUX_PATH = Path("~/Zotero/zotero.sqlite").expanduser() +_WSL_BASE = Path("/mnt/c/Users") + +_SKIP_TYPES = {"attachment", "note", "annotation"} + + +# ── Reader ──────────────────────────────────────────────────────────────────── + + +class ZoteroLocalReader: + """Read papers from a local Zotero SQLite database. + + Parameters + ---------- + db_path : str or Path, optional + Path to zotero.sqlite. If None, auto-detects Linux then WSL paths. + project : str + Scholar project name for the returned Papers collection. + """ + + def __init__( + self, + db_path: Optional[str | Path] = None, + project: str = "default", + ): + self.db_path = Path(db_path) if db_path else self._detect_db_path() + self.project = project + self._mapper = ZoteroMapper() + + # ── Public methods ──────────────────────────────────────────────────────── + + def read_all(self, limit: Optional[int] = None) -> Papers: + """Read all non-attachment items from the Zotero library. + + Parameters + ---------- + limit : int, optional + Maximum number of items to return. + + Returns + ------- + Papers + """ + item_ids = self._fetch_item_ids(limit=limit) + return self._build_papers(item_ids) + + def read_by_collection(self, name: str) -> Papers: + """Read items belonging to a Zotero collection. + + Parameters + ---------- + name : str + Collection name (case-sensitive). + + Returns + ------- + Papers + """ + with self._connect() as conn: + rows = conn.execute( + """ + SELECT ci.itemID + FROM collectionItems ci + JOIN collections col ON ci.collectionID = col.collectionID + WHERE col.collectionName = ? 
+ """, + (name,), + ).fetchall() + item_ids = [r[0] for r in rows] + return self._build_papers(item_ids) + + def read_by_tags(self, tags: List[str], match_all: bool = False) -> Papers: + """Read items matching given tags. + + Parameters + ---------- + tags : list of str + Tag names to filter by. + match_all : bool + If True, items must have ALL listed tags. + If False (default), items with ANY listed tag are returned. + + Returns + ------- + Papers + """ + placeholders = ",".join("?" * len(tags)) + with self._connect() as conn: + rows = conn.execute( + f""" + SELECT it.itemID, COUNT(DISTINCT t.name) as tag_count + FROM itemTags it + JOIN tags t ON it.tagID = t.tagID + WHERE t.name IN ({placeholders}) + GROUP BY it.itemID + """, + tags, + ).fetchall() + + required = len(tags) if match_all else 1 + item_ids = [r[0] for r in rows if r[1] >= required] + return self._build_papers(item_ids) + + def list_collections(self) -> List[str]: + """Return all collection names in the Zotero library, sorted alphabetically. + + Returns + ------- + list of str + Collection names sorted alphabetically. + """ + with self._connect() as conn: + rows = conn.execute( + "SELECT collectionName FROM collections ORDER BY collectionName" + ).fetchall() + return [r[0] for r in rows] + + def list_tags(self) -> List[Dict]: + """Return all tag names with occurrence counts, sorted by count descending. + + Returns + ------- + list of dict + Tags with structure: [{"name": str, "count": int}, ...] sorted by count (descending). + """ + with self._connect() as conn: + rows = conn.execute( + "SELECT name, COUNT(*) as cnt FROM tags GROUP BY name ORDER BY cnt DESC" + ).fetchall() + return [{"name": r[0], "count": r[1]} for r in rows] + + # ── Internal helpers ────────────────────────────────────────────────────── + + def _detect_db_path(self) -> Path: + """Auto-detect Zotero SQLite: Linux first, then WSL Windows mount.""" + if _LINUX_PATH.exists(): + return _LINUX_PATH + if _WSL_BASE.exists(): + for candidate in _WSL_BASE.glob("*/Zotero/zotero.sqlite"): + if candidate.exists(): + return candidate + raise FileNotFoundError( + "No Zotero database found. 
Checked:\n" + f" {_LINUX_PATH}\n" + f" {_WSL_BASE}/*/Zotero/zotero.sqlite\n" + "Pass db_path explicitly: ZoteroLocalReader(db_path='/path/to/zotero.sqlite')" + ) + + def _connect(self) -> sqlite3.Connection: + """Open a read-only SQLite connection.""" + conn = sqlite3.connect(f"file:{self.db_path}?mode=ro", uri=True) + conn.row_factory = sqlite3.Row + return conn + + def _fetch_item_ids(self, limit: Optional[int] = None) -> List[int]: + """Fetch IDs of all non-attachment, non-note items.""" + skip = ",".join(f"'{t}'" for t in _SKIP_TYPES) + limit_clause = f"LIMIT {limit}" if limit else "" + with self._connect() as conn: + rows = conn.execute( + f""" + SELECT i.itemID + FROM items i + JOIN itemTypes it ON i.itemTypeID = it.itemTypeID + WHERE it.typeName NOT IN ({skip}) + ORDER BY i.itemID + {limit_clause} + """ + ).fetchall() + return [r[0] for r in rows] + + def _build_papers(self, item_ids: List[int]) -> Papers: + """Batch-load all data for the given item IDs and convert to Papers.""" + if not item_ids: + return Papers([], project=self.project) + + ids_str = ",".join(str(i) for i in item_ids) + + with self._connect() as conn: + # Item base info + type_rows = conn.execute( + f""" + SELECT i.itemID, i.key, it.typeName + FROM items i + JOIN itemTypes it ON i.itemTypeID = it.itemTypeID + WHERE i.itemID IN ({ids_str}) + """ + ).fetchall() + + # All field values (batch) + field_rows = conn.execute( + f""" + SELECT id.itemID, f.fieldName, idv.value + FROM itemData id + JOIN fields f ON id.fieldID = f.fieldID + JOIN itemDataValues idv ON id.valueID = idv.valueID + WHERE id.itemID IN ({ids_str}) + """ + ).fetchall() + + # Creators (ordered) + creator_rows = conn.execute( + f""" + SELECT ic.itemID, c.firstName, c.lastName, ct.creatorType + FROM itemCreators ic + JOIN creators c ON ic.creatorID = c.creatorID + JOIN creatorTypes ct ON ic.creatorTypeID = ct.creatorTypeID + WHERE ic.itemID IN ({ids_str}) + ORDER BY ic.itemID, ic.orderIndex + """ + ).fetchall() + + # Tags + tag_rows = conn.execute( + f""" + SELECT it.itemID, t.name + FROM itemTags it + JOIN tags t ON it.tagID = t.tagID + WHERE it.itemID IN ({ids_str}) + """ + ).fetchall() + + # Group by itemID + fields: Dict[int, Dict[str, str]] = {i: {} for i in item_ids} + for row in field_rows: + fields[row[0]][row[1]] = row[2] + + creators: Dict[int, List[dict]] = {i: [] for i in item_ids} + for row in creator_rows: + creators[row[0]].append( + { + "firstName": row[1] or "", + "lastName": row[2] or "", + "creatorType": row[3], + } + ) + + tags: Dict[int, List[str]] = {i: [] for i in item_ids} + for row in tag_rows: + tags[row[0]].append(row[1]) + + # Convert to Papers via ZoteroMapper + paper_list = [] + for row in type_rows: + item_id, key, type_name = row[0], row[1], row[2] + api_dict = self._to_api_format( + key, + type_name, + fields.get(item_id, {}), + creators.get(item_id, []), + tags.get(item_id, []), + ) + try: + paper = self._mapper.zotero_to_paper(api_dict) + paper_list.append(paper) + except Exception: + pass # Skip malformed items silently + + return Papers(paper_list, project=self.project) + + def _to_api_format( + self, + key: str, + type_name: str, + fields: Dict[str, str], + creators: List[dict], + tags: List[str], + ) -> dict: + """Convert raw SQLite rows to the Zotero API dict format ZoteroMapper expects.""" + return { + "key": key, + "version": 0, + "data": { + "itemType": type_name, + "title": fields.get("title", ""), + "abstractNote": fields.get("abstractNote", ""), + "creators": creators, + "date": fields.get("date", ""), + 
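+                # Venue and identifier fields are copied 1:1 from Zotero's field table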
"DOI": fields.get("DOI", ""), + "url": fields.get("url", ""), + "publicationTitle": fields.get("publicationTitle", ""), + "journalAbbreviation": fields.get("journalAbbreviation", ""), + "volume": fields.get("volume", ""), + "issue": fields.get("issue", ""), + "pages": fields.get("pages", ""), + "publisher": fields.get("publisher", ""), + "ISSN": fields.get("ISSN", ""), + "ISBN": fields.get("ISBN", ""), + "extra": fields.get("extra", ""), + "language": fields.get("language", ""), + "tags": [{"tag": t} for t in tags], + "collections": [], + }, + } + + +# ── Convenience export ──────────────────────────────────────────────────────── + + +def export_for_zotero(papers: Papers, path: str | Path, fmt: str = "bibtex") -> Path: + """Export papers to a file that Zotero can import via File > Import. + + Parameters + ---------- + papers : Papers + Papers collection to export. + path : str or Path + Output file path (e.g. 'output.bib', 'output.ris'). + fmt : str + Format: 'bibtex' (default) or 'ris'. + + Returns + ------- + Path + The written file path. + + Example + ------- + >>> reader = ZoteroLocalReader() + >>> papers = reader.read_all() + >>> export_for_zotero(papers, "enriched.bib") + >>> # Then: Zotero > File > Import > enriched.bib + """ + from scitex.scholar.formatting import papers_to_format + + path = Path(path) + + # Convert Papers (which may hold Paper objects) to plain dicts for formatting + paper_dicts = [] + for p in papers: + if hasattr(p, "metadata"): + # Paper object — convert to formatting dict + paper_dicts.append(_paper_obj_to_dict(p)) + elif isinstance(p, dict): + paper_dicts.append(p) + + content = papers_to_format(paper_dicts, fmt) + path.write_text(content, encoding="utf-8") + return path + + +def _paper_obj_to_dict(paper) -> dict: + """Convert Paper object to the plain dict format used by formatting.py.""" + m = paper.metadata + authors_list = getattr(m.basic, "authors", []) or [] + return { + "title": getattr(m.basic, "title", "") or "", + "authors_str": " and ".join(authors_list), + "year": str(getattr(m.basic, "year", "") or ""), + "abstract": getattr(m.basic, "abstract", "") or "", + "journal": getattr(m.publication, "journal", "") or "", + "volume": getattr(m.publication, "volume", "") or "", + "number": getattr(m.publication, "issue", "") or "", + "pages": getattr(m.publication, "pages", "") or "", + "doi": getattr(m.id, "doi", "") or "", + "pmid": getattr(m.id, "pmid", "") or "", + "arxiv_id": getattr(m.id, "arxiv_id", "") or "", + "url": (getattr(m.url, "publisher", "") or getattr(m.url, "doi", "") or ""), + "document_type": getattr(m.basic, "type", "article") or "article", + "is_open_access": False, + "source": "zotero", + } + + +# ── Public API ──────────────────────────────────────────────────────────────── + +__all__ = ["ZoteroLocalReader", "export_for_zotero"] + +# EOF diff --git a/src/scitex/scholar/metadata_engines/utils/__init__.py b/src/scitex/scholar/metadata_engines/utils/__init__.py index 376b61418..af01a431c 100755 --- a/src/scitex/scholar/metadata_engines/utils/__init__.py +++ b/src/scitex/scholar/metadata_engines/utils/__init__.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- # Timestamp: "2025-08-04 08:15:00 (ywatanabe)" # File: /home/ywatanabe/proj/scitex_repo/src/scitex/scholar/doi/utils/__init__.py # ---------------------------------------- @@ -12,7 +11,7 @@ # ---------------------------------------- # Import TextNormalizer from central location -from scitex.scholar.utils.text import TextNormalizer +from scitex.scholar._utils.text 
import TextNormalizer

from ._metadata2bibtex import metadata2bibtex
from ._PubMedConverter import PubMedConverter, pmid2doi
diff --git a/src/scitex/scholar/pipelines/SearchQueryParser.py b/src/scitex/scholar/pipelines/SearchQueryParser.py
index e3d96b706..e2617116a 100755
--- a/src/scitex/scholar/pipelines/SearchQueryParser.py
+++ b/src/scitex/scholar/pipelines/SearchQueryParser.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python3
-# -*- coding: utf-8 -*-
 # File: ./src/scitex/scholar/pipelines/SearchQueryParser.py

 """
@@ -53,6 +52,12 @@ def __init__(self, query: str):
         self.max_citations: Optional[int] = None
         self.open_access: Optional[bool] = None
         self.document_type: Optional[str] = None
+        self.title_includes: List[str] = []
+        self.title_excludes: List[str] = []
+        self.author_includes: List[str] = []
+        self.author_excludes: List[str] = []
+        self.journal_includes: List[str] = []
+        self.journal_excludes: List[str] = []

         self._parse()

@@ -161,9 +166,134 @@ def get_filters(self) -> Dict[str, Any]:
             filters["open_access"] = self.open_access
         if self.document_type is not None:
             filters["document_type"] = self.document_type
+        if self.title_includes:
+            filters["title_includes"] = self.title_includes
+        if self.title_excludes:
+            filters["title_excludes"] = self.title_excludes
+        if self.author_includes:
+            filters["author_includes"] = self.author_includes
+        if self.author_excludes:
+            filters["author_excludes"] = self.author_excludes
+        if self.journal_includes:
+            filters["journal_includes"] = self.journal_includes
+        if self.journal_excludes:
+            filters["journal_excludes"] = self.journal_excludes

         return filters

+    @classmethod
+    def from_shell_syntax(cls, query: str) -> "SearchQueryParser":
+        """Parse shell-style operators from a query string.
+
+        Supports the following shell-style operators:
+            -t VALUE or --title VALUE   : Title include filter
+            -t -VALUE                   : Title exclude filter (- prefix on value)
+            -a VALUE or --author VALUE  : Author include filter
+            -a -VALUE                   : Author exclude filter
+            -j VALUE or --journal VALUE : Journal include filter
+            -j -VALUE                   : Journal exclude filter
+            -ymin YYYY or --year-min YYYY    : Minimum year
+            -ymax YYYY or --year-max YYYY    : Maximum year
+            -cmin N or --citations-min N     : Minimum citations
+            -cmax N or --citations-max N     : Maximum citations
+            -ifmin N or --if-min N           : Minimum impact factor
+            -ifmax N or --if-max N           : Maximum impact factor
+
+        Args:
+            query: Query string with shell-style operators
+
+        Returns
+        -------
+        SearchQueryParser instance with parsed fields set
+
+        Example:
+            parser = SearchQueryParser.from_shell_syntax(
+                "hippocampus -t theta -a -Smith -ymin 2020 -cmin 50"
+            )
+        """
+        # Create the instance without running the standard _parse() on the raw
+        # query: cls.__new__(cls) skips __init__ entirely, and we set
+        # original_query and the parsed fields manually.
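+        # Parsing strategy: peel recognised operators off the query with the
+        # regexes below (quoted values are matched before bare words so that
+        # -t "deep learning" keeps its inner space), then treat whatever text
+        # remains as plain positive keywords.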
+        instance = cls.__new__(cls)
+        instance.original_query = query
+        instance.positive_keywords = []
+        instance.negative_keywords = []
+        instance.year_start = None
+        instance.year_end = None
+        instance.min_impact_factor = None
+        instance.max_impact_factor = None
+        instance.min_citations = None
+        instance.max_citations = None
+        instance.open_access = None
+        instance.document_type = None
+        instance.title_includes = []
+        instance.title_excludes = []
+        instance.author_includes = []
+        instance.author_excludes = []
+        instance.journal_includes = []
+        instance.journal_excludes = []
+
+        if not query:
+            return instance
+
+        remaining = query
+
+        # Text filters: -t/-a/-j (value may be prefixed with - for exclude).
+        # Quoted alternatives come first so a quoted phrase is consumed as a
+        # whole instead of stopping at the first space.
+        text_patterns = [
+            (r'(?:-t|--title)\s+(-?)("[^"]+"|\'[^\']+\'|[^\s]+)', "title"),
+            (r'(?:-a|--author)\s+(-?)("[^"]+"|\'[^\']+\'|[^\s]+)', "author"),
+            (r'(?:-j|--journal)\s+(-?)("[^"]+"|\'[^\']+\'|[^\s]+)', "journal"),
+        ]
+
+        for pattern, field_name in text_patterns:
+            for match in re.finditer(pattern, remaining, re.IGNORECASE):
+                is_exclude = match.group(1) == "-"
+                value = match.group(2).strip("\"'")
+                if is_exclude:
+                    getattr(instance, f"{field_name}_excludes").append(value)
+                else:
+                    getattr(instance, f"{field_name}_includes").append(value)
+            remaining = re.sub(pattern, "", remaining, flags=re.IGNORECASE)
+
+        # Numeric filters
+        numeric_patterns = [
+            (r"(?:-ymin|--year-min)\s+(\d{4})", "year_min"),
+            (r"(?:-ymax|--year-max)\s+(\d{4})", "year_max"),
+            (r"(?:-cmin|--citations-min)\s+(\d+)", "citations_min"),
+            (r"(?:-cmax|--citations-max)\s+(\d+)", "citations_max"),
+            (r"(?:-ifmin|--if-min)\s+(\d+(?:\.\d+)?)", "impact_factor_min"),
+            (r"(?:-ifmax|--if-max)\s+(\d+(?:\.\d+)?)", "impact_factor_max"),
+        ]
+
+        field_mapping = {
+            "year_min": "year_start",
+            "year_max": "year_end",
+            "citations_min": "min_citations",
+            "citations_max": "max_citations",
+            "impact_factor_min": "min_impact_factor",
+            "impact_factor_max": "max_impact_factor",
+        }
+
+        for pattern, field_name in numeric_patterns:
+            match = re.search(pattern, remaining, re.IGNORECASE)
+            if match:
+                raw_value = match.group(1)
+                # Impact factors may be fractional; years and citation counts are ints
+                if "impact_factor" in field_name:
+                    value = float(raw_value)
+                else:
+                    value = int(raw_value)
+                setattr(instance, field_mapping[field_name], value)
+                remaining = re.sub(pattern, "", remaining, flags=re.IGNORECASE)
+
+        # Remaining text becomes positive keywords
+        words = remaining.split()
+        instance.positive_keywords = [w.strip() for w in words if w.strip()]
+
+        return instance
+
     def get_api_filters(self) -> Dict[str, Any]:
         """Get filters that can be pushed to API level."""
         api_filters = {}
diff --git a/src/scitex/scholar/storage/_LibraryManager.py b/src/scitex/scholar/storage/_LibraryManager.py
index eac2073e4..2288f540a 100755
--- a/src/scitex/scholar/storage/_LibraryManager.py
+++ b/src/scitex/scholar/storage/_LibraryManager.py
@@ -24,6 +24,7 @@

 from __future__ import annotations

+from pathlib import Path
 from typing import Optional

 from scitex import logging
@@ -111,12 +112,22 @@ def __init__(
         project: str = None,
         single_doi_resolver=None,
         config: Optional[ScholarConfig] = None,
+        project_dir=None,
     ):
-        """Initialize library manager."""
+        """Initialize library manager.
+
+        Parameters
+        ----------
+        project_dir : str or Path, optional
+            Root of the user's code project (e.g. ``~/my-project``).
+            When provided, project-local symlinks are also created at
+            ``{project_dir}/scitex/scholar/library/{project}/``.
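+
+        Examples
+        --------
+        Paths below are illustrative::
+
+            manager = LibraryManager(project="myproj", project_dir="~/code/myproj")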
+ """ self.config = config or ScholarConfig() self.project = self.config.resolve("project", project) self.library_master_dir = self.config.path_manager.get_library_master_dir() self.single_doi_resolver = single_doi_resolver + self.project_dir = Path(project_dir) if project_dir else None self._source_filename = "papers" self.dedup_manager = DeduplicationManager(config=self.config) diff --git a/src/scitex/scholar/storage/__init__.py b/src/scitex/scholar/storage/__init__.py index 3153ac2d7..7f1a15c76 100755 --- a/src/scitex/scholar/storage/__init__.py +++ b/src/scitex/scholar/storage/__init__.py @@ -24,6 +24,7 @@ ) from ._LibraryCacheManager import LibraryCacheManager from ._LibraryManager import LibraryManager +from ._search_filename import normalize_search_filename from .BibTeXHandler import BibTeXHandler from .PaperIO import PaperIO from .ScholarLibrary import ScholarLibrary @@ -39,4 +40,5 @@ "validate_bibtex_file", "validate_bibtex_content", "PaperIO", + "normalize_search_filename", ] diff --git a/src/scitex/scholar/storage/_mixins/_paper_saving.py b/src/scitex/scholar/storage/_mixins/_paper_saving.py index ef49f735d..c364c7903 100755 --- a/src/scitex/scholar/storage/_mixins/_paper_saving.py +++ b/src/scitex/scholar/storage/_mixins/_paper_saving.py @@ -194,7 +194,7 @@ def save_resolved_paper( logger.success(f"Saved paper to MASTER Scholar library: {paper_id}") - # Create project symlink if needed + # Create project symlinks if needed if self.project and self.project not in ["master", "MASTER"]: try: readable_name = self._generate_readable_name( @@ -204,11 +204,17 @@ def save_resolved_paper( year=year, journal=journal, ) + # ~/.scitex/scholar/library/{project}/ view self._create_project_symlink( master_storage_path=master_storage_path, project=self.project, readable_name=readable_name, ) + # {project_dir}/scitex/scholar/library/{project}/ view + self._create_project_local_symlink( + master_storage_path=master_storage_path, + readable_name=readable_name, + ) except Exception as exc_: logger.error(f"Failed to create symlink for {paper_id}: {exc_}") diff --git a/src/scitex/scholar/storage/_mixins/_symlink_handlers.py b/src/scitex/scholar/storage/_mixins/_symlink_handlers.py index 433a74efb..fdcc5cf21 100755 --- a/src/scitex/scholar/storage/_mixins/_symlink_handlers.py +++ b/src/scitex/scholar/storage/_mixins/_symlink_handlers.py @@ -188,6 +188,66 @@ def _create_project_symlink( logger.warning(f"Failed to create project symlink: {exc_}") return None + def _create_project_local_symlink( + self, + master_storage_path: Path, + readable_name: str, + ) -> Optional[Path]: + """Create symlink inside the project's own directory tree. + + Target location: ``{project_dir}/scitex/scholar/library/{project}/{readable_name}`` + Target of symlink: absolute path to master storage entry. + + This mirrors the ``~/.scitex/scholar/library/{project}/`` view directly + inside the user's code project so papers are visible alongside source code. + + Args: + master_storage_path: Absolute path to the MASTER entry directory. + readable_name: Human-readable symlink name (PDF-xx_CC-... format). + + Returns + ------- + Path to the created symlink, or None on failure. 
+ """ + if not getattr(self, "project_dir", None): + return None + if not self.project or self.project in ("master", "MASTER"): + return None + + try: + local_lib = ( + Path(self.project_dir) / "scitex" / "scholar" / "library" / self.project + ) + local_lib.mkdir(parents=True, exist_ok=True) + + symlink_path = local_lib / readable_name + + # Remove stale symlinks pointing to the same master entry + master_id = master_storage_path.name + for existing in local_lib.iterdir(): + if not existing.is_symlink(): + continue + try: + if ( + existing.resolve().name == master_id + and existing.name != readable_name + ): + existing.unlink() + except Exception: + pass + + if not symlink_path.exists(): + # Use absolute path — relative would break across project moves + symlink_path.symlink_to(master_storage_path.resolve()) + logger.success( + f"Created project-local symlink: {symlink_path} -> {master_storage_path}" + ) + return symlink_path + + except Exception as exc_: + logger.warning(f"Failed to create project-local symlink: {exc_}") + return None + def _ensure_project_symlink( self, title: str, diff --git a/src/scitex/scholar/storage/_search_filename.py b/src/scitex/scholar/storage/_search_filename.py new file mode 100755 index 000000000..8d0d6082d --- /dev/null +++ b/src/scitex/scholar/storage/_search_filename.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python3 +# File: src/scitex/scholar/storage/_search_filename.py + +"""Generate normalized filenames for saved search results. + +Format: ``-.`` + +Example:: + + from scitex.scholar.storage import normalize_search_filename + + fname = normalize_search_filename("hippocampus theta year:2020-2024") + # -> "20260218-083000-hippocampus-theta-2020-2024.bib" +""" + +import re +from datetime import datetime + + +def normalize_search_filename(query: str, extension: str = ".bib") -> str: + """Generate a timestamped, normalized filename from a search query. + + Encodes positive keywords and active filters using hyphens. + Timestamp prefix ensures files sort chronologically. + + Args: + query: Raw search query string (colon-syntax or plain keywords). + extension: File extension to append (default: '.bib'). + + Returns + ------- + Filename string, e.g. 
+ ``20260218-083000-hippocampus-theta-2020-2024.bib`` + + Examples + -------- + >>> normalize_search_filename("hippocampus sharp wave year:2020-2024") # doctest: +ELLIPSIS + '...-hippocampus-sharp-wave-2020-2024.bib' + + >>> normalize_search_filename("neural network if:>5") # doctest: +ELLIPSIS + '...-neural-network-if5.bib' + """ + # Import here to avoid circular dependency (storage <- pipelines <- storage) + from ..pipelines.SearchQueryParser import SearchQueryParser + + parser = SearchQueryParser(query) if query else None + + parts = [] + + if parser: + # Keywords (positive only) + for kw in parser.positive_keywords: + safe = re.sub(r"[^a-z0-9]+", "-", kw.lower()).strip("-") + if safe: + parts.append(safe) + + # Year range + if parser.year_start and parser.year_end: + parts.append(f"{parser.year_start}-{parser.year_end}") + elif parser.year_start: + parts.append(f"from{parser.year_start}") + elif parser.year_end: + parts.append(f"to{parser.year_end}") + + # Impact factor + if parser.min_impact_factor is not None: + val = ( + int(parser.min_impact_factor) + if parser.min_impact_factor == int(parser.min_impact_factor) + else parser.min_impact_factor + ) + parts.append(f"if{val}") + + # Citations + if parser.min_citations is not None: + parts.append(f"c{parser.min_citations}") + + # Open access + if parser.open_access: + parts.append("oa") + + # Document type + if parser.document_type: + parts.append(parser.document_type) + + stem = "-".join(parts) if parts else "search" + stem = re.sub(r"-+", "-", stem).strip("-") + + timestamp = datetime.now().strftime("%Y%m%d-%H%M%S") + ext = extension if extension.startswith(".") else f".{extension}" + return f"{timestamp}-{stem}{ext}" diff --git a/src/scitex/types/_ArrayLike.py b/src/scitex/types/_ArrayLike.py index 2f9619110..1636243cf 100755 --- a/src/scitex/types/_ArrayLike.py +++ b/src/scitex/types/_ArrayLike.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- # Timestamp: "2025-05-01 09:21:23 (ywatanabe)" # File: /home/ywatanabe/proj/scitex_repo/src/scitex/types/_ArrayLike.py # ---------------------------------------- @@ -15,7 +14,14 @@ import numpy as _np import pandas as _pd -import xarray as _xr + +try: + import xarray as _xr + + XARRAY_AVAILABLE = True +except ImportError: + XARRAY_AVAILABLE = False + _xr = None def _get_torch_tensor_type(): @@ -29,29 +35,38 @@ def _get_torch_tensor_type(): return type(None) -ArrayLike = _Union[ - _List, - _Tuple, - _np.ndarray, - _pd.Series, - _pd.DataFrame, - _xr.DataArray, -] +if XARRAY_AVAILABLE: + ArrayLike = _Union[ + _List, + _Tuple, + _np.ndarray, + _pd.Series, + _pd.DataFrame, + _xr.DataArray, + ] +else: + ArrayLike = _Union[ + _List, + _Tuple, + _np.ndarray, + _pd.Series, + _pd.DataFrame, + ] def is_array_like(obj) -> bool: """Check if object is array-like. - Returns: + Returns + ------- bool: True if object is array-like, False otherwise. 
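+
+    Examples
+    --------
+    >>> is_array_like([1, 2, 3])
+    True
+    >>> is_array_like({"a": 1})
+    False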
""" # First check against non-torch types - is_standard_array = isinstance( - obj, - (_List, _Tuple, _np.ndarray, _pd.Series, _pd.DataFrame, _xr.DataArray), - ) + base_types = [_List, _Tuple, _np.ndarray, _pd.Series, _pd.DataFrame] + if XARRAY_AVAILABLE: + base_types.append(_xr.DataArray) - if is_standard_array: + if isinstance(obj, tuple(base_types)): return True # Check torch tensor lazily to avoid circular imports diff --git a/src/scitex/writer/__init__.py b/src/scitex/writer/__init__.py index f2e8a6284..1c247afdc 100755 --- a/src/scitex/writer/__init__.py +++ b/src/scitex/writer/__init__.py @@ -58,6 +58,7 @@ bib, compile, ensure_workspace, + export, figures, guidelines, project, @@ -97,6 +98,7 @@ def __getattr__(self, name): bib = None compile = None ensure_workspace = None + export = None figures = None guidelines = None project = None @@ -131,6 +133,7 @@ def has_writer() -> bool: # Modules "bib", "compile", + "export", "figures", "guidelines", "project", diff --git a/tests/scitex/scholar/integration/zotero/test_local_reader.py b/tests/scitex/scholar/integration/zotero/test_local_reader.py new file mode 100755 index 000000000..d43a65482 --- /dev/null +++ b/tests/scitex/scholar/integration/zotero/test_local_reader.py @@ -0,0 +1,177 @@ +#!/usr/bin/env python3 +"""Tests for ZoteroLocalReader and export_for_zotero. + +Tests use the actual ~/Zotero/zotero.sqlite when present. +All tests skip gracefully when no local Zotero database is found. +""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +_LINUX_DB = Path("~/Zotero/zotero.sqlite").expanduser() +_WINDOWS_DB = Path("/mnt/c/Users/wyusu/Zotero/zotero.sqlite") + +pytestmark = pytest.mark.skipif( + not _LINUX_DB.exists(), + reason="No local Zotero database at ~/Zotero/zotero.sqlite", +) + + +# ── Fixtures ────────────────────────────────────────────────────────────────── + + +@pytest.fixture(scope="module") +def reader(): + from scitex.scholar.integration.zotero import ZoteroLocalReader + + return ZoteroLocalReader() + + +@pytest.fixture(scope="module") +def all_papers(reader): + return reader.read_all() + + +# ── Path detection ──────────────────────────────────────────────────────────── + + +def test_detect_db_path_linux(): + from scitex.scholar.integration.zotero import ZoteroLocalReader + + r = ZoteroLocalReader() + assert r.db_path.exists() + assert r.db_path.suffix == ".sqlite" + + +def test_explicit_db_path(): + from scitex.scholar.integration.zotero import ZoteroLocalReader + + r = ZoteroLocalReader(db_path=str(_LINUX_DB)) + assert r.db_path == _LINUX_DB + + +# ── read_all ───────────────────────────────────────────────────────────────── + + +def test_read_all_returns_papers(all_papers): + from scitex.scholar.core.Papers import Papers + + assert isinstance(all_papers, Papers) + + +def test_read_all_count(all_papers): + # Linux DB has 49 items — at least a few must load + assert len(all_papers) >= 1 + + +def test_read_all_titles_not_empty(all_papers): + titles = [p.metadata.basic.title for p in all_papers if p.metadata.basic.title] + assert len(titles) >= 1 + + +def test_read_all_has_authors(all_papers): + papers_with_authors = [p for p in all_papers if p.metadata.basic.authors] + assert len(papers_with_authors) >= 1 + + +def test_read_all_with_limit(reader): + papers = reader.read_all(limit=3) + assert len(papers) <= 3 + + +# ── read_by_tags ────────────────────────────────────────────────────────────── + + +def test_read_by_tags_returns_subset(reader, all_papers): + # "Epilepsy" tag is known to 
exist in the Linux DB + epilepsy_papers = reader.read_by_tags(["Epilepsy"]) + assert len(epilepsy_papers) >= 1 + assert len(epilepsy_papers) <= len(all_papers) + + +def test_read_by_tags_any(reader): + # OR logic: items with either tag + papers = reader.read_by_tags(["Epilepsy", "EEG"], match_all=False) + assert len(papers) >= 1 + + +def test_read_by_tags_all(reader): + # AND logic: items with BOTH tags (may be 0 if no overlap) + papers = reader.read_by_tags(["Epilepsy", "EEG"], match_all=True) + assert isinstance(papers.papers, list) # result is valid, even if empty + + +def test_read_by_tags_nonexistent(reader): + papers = reader.read_by_tags(["NonExistentTag_XYZ_999"]) + assert len(papers) == 0 + + +# ── read_by_collection ──────────────────────────────────────────────────────── + + +def test_read_by_collection_nonexistent(reader): + # Linux DB has 0 collections; should return empty Papers, not raise + papers = reader.read_by_collection("NonExistentCollection") + assert len(papers) == 0 + + +# ── export_for_zotero ───────────────────────────────────────────────────────── + + +def test_export_for_zotero_bibtex(all_papers, tmp_path): + from scitex.scholar.integration.zotero import export_for_zotero + + out = tmp_path / "export.bib" + result = export_for_zotero(all_papers, out, fmt="bibtex") + + assert result == out + assert out.exists() + content = out.read_text() + assert "@" in content # at least one BibTeX entry + + +def test_export_for_zotero_ris(all_papers, tmp_path): + from scitex.scholar.integration.zotero import export_for_zotero + + out = tmp_path / "export.ris" + result = export_for_zotero(all_papers, out, fmt="ris") + + assert result == out + assert out.exists() + content = out.read_text() + assert "TY -" in content # at least one RIS entry + + +def test_export_roundtrip_titles(all_papers, tmp_path): + """Titles present in Papers appear in BibTeX output.""" + from scitex.scholar.integration.zotero import export_for_zotero + + out = tmp_path / "roundtrip.bib" + export_for_zotero(all_papers, out, fmt="bibtex") + + content = out.read_text() + titles = [p.metadata.basic.title for p in all_papers if p.metadata.basic.title] + # At least one title should appear (partially) in the output + assert any(t[:20] in content for t in titles if len(t) >= 20) + + +# ── Windows WSL path ────────────────────────────────────────────────────────── + + +@pytest.mark.skipif( + not _WINDOWS_DB.exists(), + reason="Windows Zotero DB not accessible at /mnt/c/Users/wyusu/Zotero/", +) +def test_windows_db_read(): + from scitex.scholar.integration.zotero import ZoteroLocalReader + + r = ZoteroLocalReader(db_path=_WINDOWS_DB) + papers = r.read_all(limit=10) + assert len(papers) >= 1 + assert len(papers) <= 10 + + +# EOF diff --git a/tests/scitex/scholar/storage/test__search_filename_and_symlink.py b/tests/scitex/scholar/storage/test__search_filename_and_symlink.py new file mode 100755 index 000000000..39b886580 --- /dev/null +++ b/tests/scitex/scholar/storage/test__search_filename_and_symlink.py @@ -0,0 +1,809 @@ +#!/usr/bin/env python3 +# Timestamp: "2026-02-18" +# File: tests/scitex/scholar/storage/test__search_filename_and_symlink.py +# ---------------------------------------- + +""" +Comprehensive tests for normalize_search_filename and _create_project_local_symlink. 
+ +Feature 1: normalize_search_filename +- Generates timestamped filenames from search queries +- Format: YYYYMMDD-HHMMSS-{normalized-query}.{ext} +- Uses SearchQueryParser to extract filters + +Feature 2: _create_project_local_symlink +- Creates symlinks at {project_dir}/scitex/scholar/library/{project}/{readable_name} +- Symlink target is absolute path to master_storage_path +- Removes stale symlinks pointing to same master entry with different names +""" + +import importlib.util +import json +import os +import sys +from datetime import datetime +from pathlib import Path +from unittest.mock import MagicMock, Mock, patch + +import pytest + +# ============================================================================ +# Module Loading Helpers +# ============================================================================ + + +def load_module(name, path): + """Load a module from file path using importlib.util.""" + spec = importlib.util.spec_from_file_location(name, path) + if spec is None or spec.loader is None: + raise ImportError(f"Cannot load spec for {name} from {path}") + mod = importlib.util.module_from_spec(spec) + sys.modules[name] = mod + spec.loader.exec_module(mod) + return mod + + +PROJECT_ROOT = Path( + __file__ +).parent.parent.parent.parent.parent # tests/scitex/scholar/storage/ -> project root + + +@pytest.fixture(scope="session") +def search_query_parser_module(): + """Load SearchQueryParser module once per session.""" + module_path = PROJECT_ROOT / "src/scitex/scholar/pipelines/SearchQueryParser.py" + return load_module("scitex.scholar.pipelines.SearchQueryParser", str(module_path)) + + +@pytest.fixture(scope="session") +def search_filename_module(): + """Load _search_filename module once per session.""" + module_path = PROJECT_ROOT / "src/scitex/scholar/storage/_search_filename.py" + return load_module("scitex.scholar.storage._search_filename", str(module_path)) + + +@pytest.fixture(scope="session") +def symlink_handlers_module(): + """Load _symlink_handlers module once per session (avoids full scitex import chain).""" + module_path = ( + PROJECT_ROOT / "src/scitex/scholar/storage/_mixins/_symlink_handlers.py" + ) + return load_module( + "scitex.scholar.storage._mixins._symlink_handlers", str(module_path) + ) + + +@pytest.fixture(scope="session") +def SymlinkHandlersMixin(symlink_handlers_module): + """Get the SymlinkHandlersMixin class.""" + return symlink_handlers_module.SymlinkHandlersMixin + + +@pytest.fixture +def normalize_search_filename(search_filename_module): + """Get the normalize_search_filename function.""" + return search_filename_module.normalize_search_filename + + +@pytest.fixture +def SearchQueryParser(search_query_parser_module): + """Get the SearchQueryParser class.""" + return search_query_parser_module.SearchQueryParser + + +# ============================================================================ +# Feature 1: normalize_search_filename Tests +# ============================================================================ + + +class TestNormalizeSearchFilenameBasics: + """Test basic functionality of normalize_search_filename.""" + + def test_empty_query_returns_search_bib(self, normalize_search_filename): + """Empty query should return filename with 'search' as stem.""" + result = normalize_search_filename("") + # Format: YYYYMMDD-HHMMSS-search.bib + assert result.endswith("-search.bib") + # Check timestamp prefix (YYYYMMDD-HHMMSS) + parts = result.split("-") + assert len(parts) >= 3 + assert len(parts[0]) == 8 # YYYYMMDD + assert 
len(parts[1]) == 6 # HHMMSS + + def test_simple_keywords_with_hyphens(self, normalize_search_filename): + """Simple keywords should be joined with hyphens.""" + result = normalize_search_filename("hippocampus theta") + assert "hippocampus-theta" in result + assert ".bib" in result + # Should NOT have underscores + assert "_" not in result.split("-search")[0] + + def test_keywords_converted_to_lowercase(self, normalize_search_filename): + """Keywords should be converted to lowercase.""" + result = normalize_search_filename("HIPPOCAMPUS Sharp WAVE") + assert "hippocampus-sharp-wave" in result + # Verify no uppercase letters in the normalized part + stem = ( + result.split("-search")[0] if "-search" in result else result.split(".")[0] + ) + normalized_part = "-".join(stem.split("-")[2:]) # Skip timestamp + assert normalized_part.islower() + + def test_special_characters_removed(self, normalize_search_filename): + """Special characters should be removed, words joined with hyphens.""" + result = normalize_search_filename("neural@network signal-processing") + # special chars removed, hyphens preserved for word separation + assert "-" in result + assert "@" not in result + + def test_custom_extension(self, normalize_search_filename): + """Should support custom file extensions.""" + result_json = normalize_search_filename("test query", extension=".json") + assert result_json.endswith(".json") + + result_csv = normalize_search_filename("test query", extension="csv") + assert result_csv.endswith(".csv") + + result_txt = normalize_search_filename("test query", extension="txt") + assert result_txt.endswith(".txt") + + def test_extension_format_normalization(self, normalize_search_filename): + """Extension should work with or without leading dot.""" + result_with_dot = normalize_search_filename("query", extension=".bib") + result_without_dot = normalize_search_filename("query", extension="bib") + + # Both should end with .bib + assert result_with_dot.endswith(".bib") + assert result_without_dot.endswith(".bib") + + +class TestNormalizeSearchFilenameTimestamp: + """Test timestamp generation in normalize_search_filename.""" + + def test_timestamp_format_yyyymmdd_hhmmss(self, normalize_search_filename): + """Timestamp should be YYYYMMDD-HHMMSS format.""" + result = normalize_search_filename("test") + # Extract timestamp (first two hyphen-separated parts) + parts = result.split("-", 2) + assert len(parts) >= 2 + + date_part = parts[0] + time_part = parts[1] + + # Check date format (YYYYMMDD) + assert len(date_part) == 8 + assert date_part.isdigit() + year = int(date_part[:4]) + month = int(date_part[4:6]) + day = int(date_part[6:8]) + assert 2000 <= year <= 2100 + assert 1 <= month <= 12 + assert 1 <= day <= 31 + + # Check time format (HHMMSS) + assert len(time_part) == 6 + assert time_part.isdigit() + hour = int(time_part[:2]) + minute = int(time_part[2:4]) + second = int(time_part[4:6]) + assert 0 <= hour <= 23 + assert 0 <= minute <= 59 + assert 0 <= second <= 59 + + def test_timestamp_is_reasonable(self, normalize_search_filename): + """Timestamp should be close to current time.""" + before = datetime.now() + result = normalize_search_filename("query") + after = datetime.now() + + # Extract timestamp + timestamp_str = result.split("-", 2)[0] + "-" + result.split("-", 2)[1] + timestamp = datetime.strptime(timestamp_str, "%Y%m%d-%H%M%S") + + # Timestamp should be within a reasonable range (strip microseconds; strptime gives second precision) + assert before.replace(microsecond=0) <= timestamp <= after + 
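+
+# Illustrative anatomy of a generated filename, per the assertions above:
+#   20260218-083000-hippocampus-theta-2020-2024.bib
+#   = YYYYMMDD "-" HHMMSS "-" positive keywords "-" encoded filters + extension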
+ +class TestNormalizeSearchFilenameFilters: + """Test filter encoding in normalize_search_filename.""" + + def test_year_range_encoding(self, normalize_search_filename): + """Year range should be encoded as YYYY-YYYY.""" + result = normalize_search_filename("query year:2020-2024") + assert "2020-2024" in result + + def test_year_start_only_encoding(self, normalize_search_filename): + """Year start only should be encoded as from{YYYY}.""" + result = normalize_search_filename("query year:>2020") + assert "from2020" in result + + def test_year_end_only_encoding(self, normalize_search_filename): + """Year end only should be encoded as to{YYYY}.""" + result = normalize_search_filename("query year:<2024") + assert "to2024" in result + + def test_impact_factor_encoding(self, normalize_search_filename): + """Impact factor should be encoded as if{value}.""" + result = normalize_search_filename("query if:>5") + assert "if5" in result + + result_decimal = normalize_search_filename("query if:>5.5") + assert "if5.5" in result_decimal + + def test_citation_count_encoding(self, normalize_search_filename): + """Citation count should be encoded as c{count}.""" + result = normalize_search_filename("query citations:>100") + assert "c100" in result + + result_alt = normalize_search_filename("query citation:>50") + assert "c50" in result_alt + + def test_open_access_encoding(self, normalize_search_filename): + """Open access should be encoded as 'oa'.""" + result = normalize_search_filename("query open_access:true") + assert "oa" in result + + result_alt = normalize_search_filename("query oa:yes") + assert "oa" in result_alt + + def test_document_type_encoding(self, normalize_search_filename): + """Document type should be appended to filename.""" + result = normalize_search_filename("query type:article") + assert "article" in result + + result_review = normalize_search_filename("query type:review") + assert "review" in result_review + + def test_complex_query_with_multiple_filters(self, normalize_search_filename): + """Complex query should encode all filters correctly.""" + result = normalize_search_filename( + "hippocampus neural network year:2020-2024 if:>5 citations:>100 oa:true type:article" + ) + # Should contain all key elements + assert "hippocampus" in result + assert "neural-network" in result + assert "2020-2024" in result + assert "if5" in result + assert "c100" in result + assert "oa" in result + assert "article" in result + + +class TestNormalizeSearchFilenameMixins: + """Test mixin behavior and edge cases.""" + + def test_multiple_spaces_collapsed(self, normalize_search_filename): + """Multiple spaces should be handled gracefully.""" + result1 = normalize_search_filename("keyword1 keyword2") + result2 = normalize_search_filename("keyword1 keyword2") + # Both should produce same normalized part + assert ( + result1.split(".")[0].split("-")[-2:] + == result2.split(".")[0].split("-")[-2:] + ) + + def test_leading_trailing_whitespace_ignored(self, normalize_search_filename): + """Leading and trailing whitespace should be ignored.""" + result1 = normalize_search_filename(" query ") + result2 = normalize_search_filename("query") + # Normalized parts should match + assert ( + result1.split(".")[0].split("-")[-1] == result2.split(".")[0].split("-")[-1] + ) + + def test_quoted_phrases_treated_as_single_keyword(self, normalize_search_filename): + """Quoted phrases should be treated as single keywords with hyphens.""" + result = normalize_search_filename('"sharp wave" ripple') + # Phrase should be present 
(possibly hyphenated) + assert "sharp" in result + assert "wave" in result + assert "ripple" in result + + def test_negative_keywords_excluded(self, normalize_search_filename): + """Negative keywords (prefixed with -) should not appear in filename.""" + result = normalize_search_filename("hippocampus -seizure -epilepsy") + assert "hippocampus" in result + # Negative keywords should not be in the filename + assert "seizure" not in result + assert "epilepsy" not in result + + def test_hyphen_collapsing(self, normalize_search_filename): + """Multiple consecutive hyphens should be collapsed.""" + # This tests the internal regex cleanup + result = normalize_search_filename("query") + # Should have only single hyphens between components + timestamp_sep = result.count("---") + # Should not have triple hyphens in the normalized part + assert timestamp_sep == 0 + + def test_leading_trailing_hyphens_stripped(self, normalize_search_filename): + """Leading and trailing hyphens in normalized part should be stripped.""" + result = normalize_search_filename("test") + # Split on dots to get filename without extension + filename = result.split(".")[0] + # Extract normalized part (skip YYYYMMDD-HHMMSS-) + normalized = "-".join(filename.split("-")[2:]) + # Should not start or end with hyphen + assert not normalized.startswith("-") + assert not normalized.endswith("-") + + +# ============================================================================ +# Feature 2: _create_project_local_symlink Tests +# ============================================================================ + + +class TestCreateProjectLocalSymlinkBasics: + """Test basic symlink creation functionality.""" + + @pytest.fixture + def mixin_instance(self, tmp_path, SymlinkHandlersMixin): + """Create a minimal instance with the mixin.""" + + class FakeLibraryManager(SymlinkHandlersMixin): + def __init__(self, project=None, project_dir=None): + self.project = project + self.project_dir = project_dir + + return FakeLibraryManager() + + def test_returns_none_when_project_dir_not_set(self, mixin_instance, tmp_path): + """Should return None if project_dir is not set.""" + mixin_instance.project = "test_project" + mixin_instance.project_dir = None + + master_path = tmp_path / "master" / "ABC123" + master_path.mkdir(parents=True) + + result = mixin_instance._create_project_local_symlink( + master_path, "readable_name" + ) + assert result is None + + def test_returns_none_for_master_project(self, mixin_instance, tmp_path): + """Should return None when project is 'master'.""" + mixin_instance.project = "master" + mixin_instance.project_dir = tmp_path / "project" + mixin_instance.project_dir.mkdir(parents=True) + + master_path = tmp_path / "master" / "ABC123" + master_path.mkdir(parents=True) + + result = mixin_instance._create_project_local_symlink( + master_path, "readable_name" + ) + assert result is None + + def test_returns_none_for_master_uppercase(self, mixin_instance, tmp_path): + """Should return None when project is 'MASTER' (uppercase).""" + mixin_instance.project = "MASTER" + mixin_instance.project_dir = tmp_path / "project" + mixin_instance.project_dir.mkdir(parents=True) + + master_path = tmp_path / "master" / "ABC123" + master_path.mkdir(parents=True) + + result = mixin_instance._create_project_local_symlink( + master_path, "readable_name" + ) + assert result is None + + def test_creates_symlink_at_correct_path(self, mixin_instance, tmp_path): + """Should create symlink at {project_dir}/scitex/scholar/library/{project}/{readable_name}.""" 
+ mixin_instance.project = "test_project" + mixin_instance.project_dir = tmp_path / "project" + + master_path = tmp_path / "master" / "ABC123" + master_path.mkdir(parents=True) + + readable_name = "PDF-01_CC-000100_IF-005_2024_Smith_Nature" + result = mixin_instance._create_project_local_symlink( + master_path, readable_name + ) + + # Check that symlink was created at expected location + expected_path = ( + tmp_path + / "project" + / "scitex" + / "scholar" + / "library" + / "test_project" + / readable_name + ) + assert result == expected_path + assert expected_path.exists() + assert expected_path.is_symlink() + + def test_symlink_target_is_absolute(self, mixin_instance, tmp_path): + """Symlink target should be absolute path to master storage.""" + mixin_instance.project = "test_project" + mixin_instance.project_dir = tmp_path / "project" + + master_path = tmp_path / "master" / "ABC123" + master_path.mkdir(parents=True) + + readable_name = "Paper_Name" + symlink_path = mixin_instance._create_project_local_symlink( + master_path, readable_name + ) + + # Resolve symlink target + target = symlink_path.resolve() + + # Target should be the absolute path to master storage + assert target == master_path.resolve() + assert target.is_absolute() + + def test_creates_parent_directories(self, mixin_instance, tmp_path): + """Should create parent directories if they don't exist.""" + mixin_instance.project = "new_project" + mixin_instance.project_dir = tmp_path / "project" + + master_path = tmp_path / "master" / "DEF456" + master_path.mkdir(parents=True) + + result = mixin_instance._create_project_local_symlink(master_path, "paper_name") + + # Check that parent directory structure was created + parent_dir = ( + tmp_path / "project" / "scitex" / "scholar" / "library" / "new_project" + ) + assert parent_dir.exists() + assert parent_dir.is_dir() + + +class TestCreateProjectLocalSymlinkStaleSymlinks: + """Test stale symlink removal functionality.""" + + @pytest.fixture + def mixin_instance(self, tmp_path, SymlinkHandlersMixin): + """Create a minimal instance with the mixin.""" + + class FakeLibraryManager(SymlinkHandlersMixin): + def __init__(self, project=None, project_dir=None): + self.project = project + self.project_dir = project_dir + + return FakeLibraryManager() + + def test_removes_stale_symlink_same_master_different_name( + self, mixin_instance, tmp_path + ): + """Should remove stale symlink pointing to same master entry with different name.""" + mixin_instance.project = "test_project" + mixin_instance.project_dir = tmp_path / "project" + + master_path = tmp_path / "master" / "ABC123" + master_path.mkdir(parents=True) + + # Create directory for symlinks + symlink_dir = ( + tmp_path / "project" / "scitex" / "scholar" / "library" / "test_project" + ) + symlink_dir.mkdir(parents=True) + + # Create a stale symlink with different name pointing to same master + old_symlink = symlink_dir / "Old_Paper_Name" + old_symlink.symlink_to(master_path.resolve()) + + assert old_symlink.exists() + + # Create new symlink with different name, same master target + new_name = "New_Paper_Name" + result = mixin_instance._create_project_local_symlink(master_path, new_name) + + # Old symlink should be removed + assert not old_symlink.exists() + + # New symlink should exist + assert result.exists() + assert result.name == new_name + + def test_preserves_symlink_with_same_name(self, mixin_instance, tmp_path): + """Should not remove symlink if name matches.""" + mixin_instance.project = "test_project" + 
mixin_instance.project_dir = tmp_path / "project" + + master_path = tmp_path / "master" / "ABC123" + master_path.mkdir(parents=True) + + symlink_dir = ( + tmp_path / "project" / "scitex" / "scholar" / "library" / "test_project" + ) + symlink_dir.mkdir(parents=True) + + readable_name = "PDF-01_CC-000100_IF-005_2024_Smith_Nature" + symlink_path = symlink_dir / readable_name + symlink_path.symlink_to(master_path.resolve()) + + original_inode = symlink_path.lstat().st_ino + + # Call method with same name + result = mixin_instance._create_project_local_symlink( + master_path, readable_name + ) + + # Symlink should still exist (not removed) + assert result.exists() + # Should point to same target + assert result.resolve() == master_path.resolve() + + def test_ignores_non_symlink_files(self, mixin_instance, tmp_path): + """Should ignore non-symlink files in directory.""" + mixin_instance.project = "test_project" + mixin_instance.project_dir = tmp_path / "project" + + master_path = tmp_path / "master" / "ABC123" + master_path.mkdir(parents=True) + + symlink_dir = ( + tmp_path / "project" / "scitex" / "scholar" / "library" / "test_project" + ) + symlink_dir.mkdir(parents=True) + + # Create a regular file (not symlink) + regular_file = symlink_dir / "regular_file.txt" + regular_file.write_text("This is a regular file") + + # Create symlink + result = mixin_instance._create_project_local_symlink( + master_path, "new_symlink" + ) + + # Regular file should still exist + assert regular_file.exists() + assert not regular_file.is_symlink() + + # New symlink should be created + assert result.exists() + + def test_handles_broken_symlinks_gracefully(self, mixin_instance, tmp_path): + """Should handle broken symlinks without crashing.""" + mixin_instance.project = "test_project" + mixin_instance.project_dir = tmp_path / "project" + + master_path = tmp_path / "master" / "ABC123" + master_path.mkdir(parents=True) + + symlink_dir = ( + tmp_path / "project" / "scitex" / "scholar" / "library" / "test_project" + ) + symlink_dir.mkdir(parents=True) + + # Create a broken symlink (target doesn't exist) + broken_symlink = symlink_dir / "broken_link" + broken_symlink.symlink_to("/nonexistent/path") + + # This should not crash + result = mixin_instance._create_project_local_symlink( + master_path, "new_symlink" + ) + + assert result.exists() + # Broken symlink should remain (since target doesn't match) + assert broken_symlink.is_symlink() + + def test_removes_only_matching_master_id(self, mixin_instance, tmp_path): + """Should only remove symlinks pointing to the same master ID.""" + mixin_instance.project = "test_project" + mixin_instance.project_dir = tmp_path / "project" + + master_path_1 = tmp_path / "master" / "ABC123" + master_path_2 = tmp_path / "master" / "DEF456" + master_path_1.mkdir(parents=True) + master_path_2.mkdir(parents=True) + + symlink_dir = ( + tmp_path / "project" / "scitex" / "scholar" / "library" / "test_project" + ) + symlink_dir.mkdir(parents=True) + + # Create symlinks to different masters + old_symlink_1 = symlink_dir / "Old_Name_1" + old_symlink_1.symlink_to(master_path_1.resolve()) + + other_symlink = symlink_dir / "Other_Master" + other_symlink.symlink_to(master_path_2.resolve()) + + # Create new symlink for master 1 with different name + result = mixin_instance._create_project_local_symlink( + master_path_1, "New_Name_1" + ) + + # Old symlink for master 1 should be removed + assert not old_symlink_1.exists() + + # Symlink for other master should remain + assert other_symlink.exists() + 
assert other_symlink.resolve() == master_path_2.resolve() + + # New symlink should exist + assert result.exists() + assert result.resolve() == master_path_1.resolve() + + +class TestCreateProjectLocalSymlinkReturnValue: + """Test return values of _create_project_local_symlink.""" + + @pytest.fixture + def mixin_instance(self, tmp_path, SymlinkHandlersMixin): + """Create a minimal instance with the mixin.""" + + class FakeLibraryManager(SymlinkHandlersMixin): + def __init__(self, project=None, project_dir=None): + self.project = project + self.project_dir = project_dir + + return FakeLibraryManager() + + def test_returns_path_object_on_success(self, mixin_instance, tmp_path): + """Should return Path object when symlink is created successfully.""" + mixin_instance.project = "test_project" + mixin_instance.project_dir = tmp_path / "project" + + master_path = tmp_path / "master" / "ABC123" + master_path.mkdir(parents=True) + + result = mixin_instance._create_project_local_symlink(master_path, "paper") + + assert isinstance(result, Path) + assert result.exists() + + def test_return_path_is_correct_path(self, mixin_instance, tmp_path): + """Returned path should match the created symlink path.""" + mixin_instance.project = "test_project" + mixin_instance.project_dir = tmp_path / "project" + + master_path = tmp_path / "master" / "ABC123" + master_path.mkdir(parents=True) + readable_name = "PDF-01_CC_IF" + + result = mixin_instance._create_project_local_symlink( + master_path, readable_name + ) + + expected = ( + tmp_path + / "project" + / "scitex" + / "scholar" + / "library" + / "test_project" + / readable_name + ) + assert result == expected + + def test_returns_none_on_missing_project_dir(self, mixin_instance, tmp_path): + """Should return None if project_dir is None.""" + mixin_instance.project = "test_project" + mixin_instance.project_dir = None + + master_path = tmp_path / "master" / "ABC123" + master_path.mkdir(parents=True) + + result = mixin_instance._create_project_local_symlink(master_path, "paper") + assert result is None + + def test_returns_none_on_master_project(self, mixin_instance, tmp_path): + """Should return None if project is 'master'.""" + mixin_instance.project = "master" + mixin_instance.project_dir = tmp_path / "project" + mixin_instance.project_dir.mkdir(parents=True) + + master_path = tmp_path / "master" / "ABC123" + master_path.mkdir(parents=True) + + result = mixin_instance._create_project_local_symlink(master_path, "paper") + assert result is None + + +class TestCreateProjectLocalSymlinkEdgeCases: + """Test edge cases and special scenarios.""" + + @pytest.fixture + def mixin_instance(self, tmp_path, SymlinkHandlersMixin): + """Create a minimal instance with the mixin.""" + + class FakeLibraryManager(SymlinkHandlersMixin): + def __init__(self, project=None, project_dir=None): + self.project = project + self.project_dir = project_dir + + return FakeLibraryManager() + + def test_handles_special_characters_in_readable_name( + self, mixin_instance, tmp_path + ): + """Should handle special characters in readable_name.""" + mixin_instance.project = "test_project" + mixin_instance.project_dir = tmp_path / "project" + + master_path = tmp_path / "master" / "ABC123" + master_path.mkdir(parents=True) + + readable_name = "PDF-01_CC-000100_IF-005_2024_Smith-Jones_Nature-Science" + result = mixin_instance._create_project_local_symlink( + master_path, readable_name + ) + + assert result is not None + assert result.exists() + assert result.name == readable_name + + def 
test_handles_long_readable_name(self, mixin_instance, tmp_path): + """Should handle long readable names.""" + mixin_instance.project = "test_project" + mixin_instance.project_dir = tmp_path / "project" + + master_path = tmp_path / "master" / "ABC123" + master_path.mkdir(parents=True) + + readable_name = "PDF-01_CC-999999_IF-999_2024_VeryLongAuthorName_VeryLongJournalNameThatExceedsNormalLength" + result = mixin_instance._create_project_local_symlink( + master_path, readable_name + ) + + assert result is not None + assert result.exists() + assert result.name == readable_name + + def test_handles_paths_with_spaces(self, mixin_instance, tmp_path): + """Should handle paths with spaces.""" + mixin_instance.project = "test project" + project_dir = tmp_path / "my project" + mixin_instance.project_dir = project_dir + + master_path = tmp_path / "master" / "ABC123" + master_path.mkdir(parents=True) + + result = mixin_instance._create_project_local_symlink(master_path, "paper name") + + assert result is not None + assert result.exists() + # Path should contain spaces correctly + assert "test project" in str(result) + assert "paper name" in str(result) + + def test_handles_nested_master_paths(self, mixin_instance, tmp_path): + """Should handle deeply nested master storage paths.""" + mixin_instance.project = "test_project" + mixin_instance.project_dir = tmp_path / "project" + + # Create nested path + master_path = tmp_path / "archive" / "deep" / "nested" / "master" / "ABC123" + master_path.mkdir(parents=True) + + result = mixin_instance._create_project_local_symlink(master_path, "paper") + + assert result is not None + assert result.exists() + # Symlink target should resolve to the correct master path + assert result.resolve() == master_path.resolve() + + def test_idempotency_same_call_twice(self, mixin_instance, tmp_path): + """Calling with same arguments twice should be idempotent.""" + mixin_instance.project = "test_project" + mixin_instance.project_dir = tmp_path / "project" + + master_path = tmp_path / "master" / "ABC123" + master_path.mkdir(parents=True) + readable_name = "Paper_Name" + + result1 = mixin_instance._create_project_local_symlink( + master_path, readable_name + ) + result2 = mixin_instance._create_project_local_symlink( + master_path, readable_name + ) + + # Both calls should return same path + assert result1 == result2 + # Path should exist after both calls + assert result1.exists() + assert result2.exists() + # Should point to same target + assert result1.resolve() == result2.resolve() + + +# EOF diff --git a/tests/scitex/stats/descriptive/test__describe.py b/tests/scitex/stats/descriptive/test__describe.py old mode 100644 new mode 100755 index 26ed40906..9696ef24a --- a/tests/scitex/stats/descriptive/test__describe.py +++ b/tests/scitex/stats/descriptive/test__describe.py @@ -24,8 +24,9 @@ def test_basic_describe(self): 7, ), f"Expected shape (10, 7), got {described.shape}" assert len(names) == 7, "Should return 7 stat names" - assert "nanmean" in names - assert "nanstd" in names + assert "mean" in names + assert "std" in names + assert "median" in names def test_with_nans(self): """Test with NaN values.""" @@ -63,9 +64,9 @@ def test_all_funcs(self): described, names = describe(x, dim=-1, funcs="all") assert len(names) > 7, "Should return all available functions" - assert "nanmax" in names - assert "nanmin" in names - assert "nancount" in names + assert "max" in names + assert "min" in names + assert "count" in names def test_custom_funcs(self): """Test with custom function 
list."""