
Commit 3de8b89

[DNM] tempest scraper
1 parent 4ad50f8 commit 3de8b89

4 files changed: 163 additions & 0 deletions

data_scraper/common/constants.py
Lines changed: 1 addition & 0 deletions

@@ -3,6 +3,7 @@
 JIRA_COLLECTION_NAME = "rca-knowledge-base"
 OSP_DOCS_COLLECTION_NAME = "rca-osp-docs-knowledge-base"
 ERRATA_COLLECTION_NAME = "rca-errata"
+TEMPEST_COLLECTION_NAME = "rca-tempest"
 DEFAULT_EMBEDDING_MODEL = "BAAI/bge-m3"
 DEFAULT_JIRA_URL = "https://issues.redhat.com"
 DEFAULT_JIRA_PROJECTS = {

data_scraper/core/tempest_scraper.py
Lines changed: 66 additions & 0 deletions

@@ -0,0 +1,66 @@
+"""Code for scraping Tempest data."""
+import logging
+import multiprocessing as mp
+import subprocess
+import sys
+from datetime import datetime
+from typing import TypedDict
+
+import pandas as pd
+import regex as re
+
+from data_scraper.core.scraper import Scraper
+from data_scraper.processors.tempest_provider import TempestProvider
+
+
+LOG = logging.getLogger(__name__)
+LOG.setLevel(logging.INFO)
+
+
+class TempestRecord(TypedDict):
+    """Represents a record extracted from a Tempest report.
+
+    Attributes:
+        test_name: name of the failed test
+        traceback: traceback of the failed test
+        report_url: URL of the tempest report
+    """
+
+    test_name: str
+    traceback: str
+    report_url: str
+
+
+class TempestScraper(Scraper):
+    """Main class for Tempest scraping and processing."""
+
+    def __init__(self, config: dict):
+        super().__init__(config=config)
+        self.config = config
+        self.tempest_provider = TempestProvider(self.config["zuul_url"])
+
+    def get_documents(self) -> list[dict]:
+        # TODO: fetch failed-test reports via self.tempest_provider
+        results: list[dict] = []
+        return results
+
+    def get_records(self, documents: list[dict]) -> list[TempestRecord]:
+        tempest_records: list[TempestRecord] = []
+        # TODO: build TempestRecord entries from the scraped documents
+        return tempest_records
+
+    def get_chunks(self, record: dict) -> list[str]:
+        chunks = []
+        # TODO: split the record's traceback into embedding-sized chunks
+        return chunks
+
+    def cleanup_records(
+        self, records: list, backup_path: str = "tempest_all_data.pickle"
+    ) -> list:
+        # TODO: deduplicate records and back them up to backup_path
+        return records
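
The scraper methods are still stubs in this [DNM] commit. As an illustration only (not part of the commit), a minimal sketch of how the provider's {report_url: [[test, traceback], ...]} mapping could be flattened into TempestRecord entries:

    # Hypothetical sketch: flattening the dict returned by
    # TempestProvider.get_testname_traceback_pairs() into TempestRecord entries.
    def build_records(pairs_by_report: dict) -> list[TempestRecord]:
        records: list[TempestRecord] = []
        for report_url, pairs in pairs_by_report.items():
            for test_name, traceback in pairs:
                records.append(TempestRecord(
                    test_name=test_name,
                    traceback=traceback,
                    report_url=report_url,
                ))
        return records

    # Example: one report with two failures yields two records.
    sample = {"https://zuul.example.com/report.html": [
        ["test_volume_attach", "Traceback (most recent call last): ..."],
        ["test_server_boot", "Traceback (most recent call last): ..."],
    ]}
    print(build_records(sample))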

data_scraper/main.py

Lines changed: 55 additions & 0 deletions

@@ -6,6 +6,7 @@
 from data_scraper.common import constants
 from data_scraper.core.scraper import JiraScraper, OSPDocScraper
 from data_scraper.core.errata_scraper import ErrataScraper
+from data_scraper.core.tempest_scraper import TempestScraper
 
 logging.basicConfig(
     format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
@@ -178,3 +179,57 @@ def errata_scraper() -> None:
 
     scraper = ErrataScraper(config_args)
     scraper.run()
+
+
+def tempest_scraper() -> None:
+    """Entry point for command line execution."""
+    parser = ArgumentParser("tempest_scraper")
+
+    # Required arguments
+    parser.add_argument("--database_client_url", type=str, required=True)
+    parser.add_argument("--llm_server_url", type=str, required=True)
+    parser.add_argument("--llm_api_key", type=str, required=True)
+    parser.add_argument("--database_api_key", type=str, required=True)
+    parser.add_argument("--zuul-url", type=str, required=True)
+
+    # Optional arguments
+    parser.add_argument("--chunk_size", type=int,
+                        default=constants.DEFAULT_CHUNK_SIZE)
+    parser.add_argument("--embedding_model", type=str,
+                        default=constants.DEFAULT_EMBEDDING_MODEL)
+    parser.add_argument("--db_collection_name", type=str,
+                        default=constants.TEMPEST_COLLECTION_NAME)
+    parser.add_argument("--scraper-processes", type=int,
+                        default=constants.DEFAULT_NUM_SCRAPER_PROCESSES)
+    parser.add_argument("--date_cutoff", type=datetime.fromisoformat,
+                        default=datetime.fromisoformat(constants.DEFAULT_DATE_CUTOFF),
+                        help=(
+                            "No issues from before this date will be used. "
+                            "Date must follow ISO format 'YYYY-MM-DD'"
+                        ))
+    parser.add_argument("--recreate_collection", type=bool, default=True,
+                        help="Recreate database collection from scratch.")
+    args = parser.parse_args()
+
+    config_args = {
+        "database_client_url": args.database_client_url,
+        "llm_server_url": args.llm_server_url,
+        "llm_api_key": args.llm_api_key,
+        "database_api_key": args.database_api_key,
+        "chunk_size": args.chunk_size,
+        "embedding_model": args.embedding_model,
+        "db_collection_name": args.db_collection_name,
+        "zuul_url": args.zuul_url,
+        "scraper_processes": args.scraper_processes,
+        "date_cutoff": args.date_cutoff,
+        "recreate_collection": args.recreate_collection,
+    }
+
+    scraper = TempestScraper(config_args)
+    scraper.run()
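
Since --zuul-url and --scraper-processes use dashes, argparse exposes them in the parsed namespace as args.zuul_url and args.scraper_processes. A minimal standalone check of that dash-to-underscore behavior:

    from argparse import ArgumentParser

    parser = ArgumentParser("demo")
    parser.add_argument("--zuul-url", type=str, required=True)
    # Dashes in option names become underscores in the parsed namespace.
    args = parser.parse_args(["--zuul-url", "https://zuul.example.com"])
    print(args.zuul_url)  # -> https://zuul.example.com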

data_scraper/processors/tempest_provider.py
Lines changed: 41 additions & 0 deletions

@@ -0,0 +1,41 @@
+"""Client to fetch Tempest reports."""
+import logging
+
+import requests
+
+LOG = logging.getLogger(__name__)
+LOG.setLevel(logging.INFO)
+
+
+class TempestProvider:
+    """Provider of failed tempest tests for unijobs."""
+
+    def __init__(self, query_url: str):
+        self.query_url = query_url
+
+    def search_tempest_records_with_failure(self) -> list:
+        """Search for tempest runs that contain failures.
+
+        Returns:
+            list: URLs of report.html files
+        """
+        # TODO: query self.query_url for builds with failed tempest tests
+        return []
+
+    def get_testname_traceback_pairs(self, report_urls: list) -> dict:
+        """For every entry in a list of URLs, issue a call to the parser
+        endpoint, get a list of (testname, traceback) pairs, and add it
+        to a dictionary.
+
+        Args:
+            report_urls: [<address_to>/tempest_report1.html, ...,
+                          <address_to>/tempest_reportN.html]
+
+        Returns:
+            dict:
+                {<address_to>/tempest_report1.html:
+                     [[test1, traceback1], ..., [testN, tracebackN]],
+                 ...,
+                 <address_to>/tempest_reportM.html:
+                     [[test1, traceback1], ..., [testK, tracebackK]]}
+        """
+        LOG.info("Requesting tempest reports -> %s", self.query_url)
+        # TODO: fetch and parse each report
+        return {}
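
The parsing step is not implemented in this commit. A sketch of one possible shape for get_testname_traceback_pairs, assuming the report pages embed failures in a greppable pattern; the FAILURE_RE pattern and the 30-second timeout are assumptions for illustration, not the commit's actual parser:

    import requests
    import regex as re

    # Assumed failure pattern; the real report.html layout is not shown in
    # this commit, so this regex is illustrative only.
    FAILURE_RE = re.compile(
        r"FAIL: (?P<test>\S+)\n(?P<traceback>Traceback.*?)(?=\n\n|\Z)", re.S
    )

    def get_testname_traceback_pairs(report_urls: list) -> dict:
        pairs_by_report: dict = {}
        for url in report_urls:
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            # Collect [testname, traceback] pairs per report URL.
            pairs_by_report[url] = [
                [m.group("test"), m.group("traceback")]
                for m in FAILURE_RE.finditer(response.text)
            ]
        return pairs_by_report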
