diff --git a/.github/workflows/repo-checks.yml b/.github/workflows/repo-checks.yml index 394a8673f..07445efc0 100644 --- a/.github/workflows/repo-checks.yml +++ b/.github/workflows/repo-checks.yml @@ -3,6 +3,20 @@ name: Repo Checks on: ["push", "pull_request"] jobs: + check_python_quality: + name: Code Quality + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-python@v6 + with: + python-version: "3.11" + - run: pip install -r llm/requirements.txt + - run: ruff check llm/ + - run: ruff format --check llm/ + - run: pytest llm/ + check_code_quality: name: Code Quality runs-on: ubuntu-latest diff --git a/llm/llm_functions.py b/llm/llm_functions.py index 78e9d13e1..d17e3a8f6 100644 --- a/llm/llm_functions.py +++ b/llm/llm_functions.py @@ -1,20 +1,20 @@ """ This code implements text summarization, category selection and tagging bills using templated LLM prompts. -The main functions and their objectives are: -1. get_summary_api_function: Function used to summarize a bill - It takes in bill id, bill title and bill text +The main functions and their objectives are: +1. get_summary_api_function: Function used to summarize a bill - It takes in bill id, bill title and bill text and returns summary of the bill. 2. get_tags_api_function: Function used to tag a bill with pre specified tags - It takes in bill id, bill title and bill text and returns the selected tags from specified tags. - -3. get_summaries_and_tags_api_function: Combined function that generates both summary and tags in a single call - - It takes in bill id, bill title and bill text, first generates a summary, and then - uses this summary to generate relevant tags. This approach ensures tags are based on + +3. get_summaries_and_tags_api_function: Combined function that generates both summary and tags in a single call - + It takes in bill id, bill title and bill text, first generates a summary, and then + uses this summary to generate relevant tags. This approach ensures tags are based on the distilled information in the summary rather than the full bill text. -4. get_tags_api_function_v2: Optimized version of tag generation that works with bill summaries - It takes in - bill id, bill title and bill summary (instead of full text) to generate tags. This +4. get_tags_api_function_v2: Optimized version of tag generation that works with bill summaries - It takes in + bill id, bill title and bill summary (instead of full text) to generate tags. This version provides more focused tagging by working with already-distilled information. Note: @@ -23,18 +23,14 @@ - Templates for prompts are maintained separately to ensure consistency across different parts of the application """ -import json + import numpy as np -import os import pandas as pd import tiktoken -import streamlit as st -import urllib.request import chromadb import re import requests -from chromadb.config import Settings from dataclasses import dataclass, field from langchain.globals import set_llm_cache @@ -53,27 +49,37 @@ from operator import itemgetter from pathlib import Path -from rouge_score import rouge_scorer from requests.exceptions import RequestException -from sklearn.feature_extraction.text import TfidfVectorizer -from sklearn.metrics.pairwise import cosine_similarity -from typing import Tuple, List +from typing import List from requests.packages.urllib3.exceptions import InsecureRequestWarning -from prompts import * -from tag_categories import * +from prompts import ( + TAGGING_PROMPT_LARGE, + TAGGING_PROMPT_SMALL, + CATEGORIZATION_PROMPT_SMALL, + SUMMARIZATION_PROMPT_SMALL, + CATEGORIZATION_PROMPT_LARGE, + SUMMARIZATION_PROMPT_LARGE, + TAGGING_PROMPT_USING_SUMMARIES, +) +from tag_categories import ( + new_categories_for_bill_list, + new_tags, + new_tags_for_bill_dict, +) from normalize_summaries import normalize_summary -GPT_MDOEL_VERSION = 'gpt-4o-mini' +GPT_MDOEL_VERSION = "gpt-4o-mini" MAX_TOKEN_LIMIT = 128000 CHROMA_DB_PATH = "./databases/chroma_db" LLM_CACHE = Path("./databases/llm_cache.db") -API_KEY = '' # Optional: Add API Key here if you want to use legacy functions +API_KEY = "" # Optional: Add API Key here if you want to use legacy functions -def extract_sections(bill_text: str) -> list[tuple[str, str]]: + +def extract_sections(bill_text: str) -> list[tuple[str, str]]: """ Extracts chapters and sections from a bill using regular expressions. @@ -86,99 +92,106 @@ def extract_sections(bill_text: str) -> list[tuple[str, str]]: regex = "" chapter = "" section = "" - - + check_list = ["section", "chapter"] - #Regex to extract strings containing "section# of chapter#, 'section# of said chapter#' , 'section#' or 'chapter#' in a list - regex = re.findall(r'(section)(\s+\d+[a-zA-Z]+|\s+\d+)\s+(of|of\ssaid)\s(chapter)(\s+\d+[a-zA-Z]+|\s\d+)|(section|chapter)(\s+\d+[a-zA-Z]+|\s\d+)',str(bill_text), re.IGNORECASE) - - lists_with_both =[] + # Regex to extract strings containing "section# of chapter#, 'section# of said chapter#' , 'section#' or 'chapter#' in a list + regex = re.findall( + r"(section)(\s+\d+[a-zA-Z]+|\s+\d+)\s+(of|of\ssaid)\s(chapter)(\s+\d+[a-zA-Z]+|\s\d+)|(section|chapter)(\s+\d+[a-zA-Z]+|\s\d+)", + str(bill_text), + re.IGNORECASE, + ) + + lists_with_both = [] current_chapter = "" - #iterate over regex list that contains both chapters and sections - for x in regex: + # iterate over regex list that contains both chapters and sections + for x in regex: items = [] - #iterate over extracted lists + # iterate over extracted lists for item in x: item = item.casefold() - items.append(item) + items.append(item) if all(name in items for name in check_list): for i, j in enumerate(x): - if j.casefold() == "section": - section = x[i+1].strip() - + section = x[i + 1].strip() + if j.casefold() == "chapter": - chapter = x[i+1].strip() + chapter = x[i + 1].strip() + + # save current chapter in order to use it for pairing with sections mentioned later + current_chapter = chapter - #save current chapter in order to use it for pairing with sections mentioned later - current_chapter = chapter - - #add chapter and section to the list + # add chapter and section to the list list_with_both = [chapter, section] - - #only keep new/unique chapter and section pairs + + # only keep new/unique chapter and section pairs if list_with_both not in lists_with_both: lists_with_both.append(list_with_both) else: - #iterate over list that contains only sections or chapters + # iterate over list that contains only sections or chapters for i, j in enumerate(x): - #ignore SECTION with caps as it indicates sections from the bills and not the MGL - if j == "SECTION": + # ignore SECTION with caps as it indicates sections from the bills and not the MGL + if j == "SECTION": continue if j == "": continue - + else: if j.casefold() == "chapter": - current_chapter = x[i+1].strip() #keep track of current chapter - + current_chapter = x[ + i + 1 + ].strip() # keep track of current chapter + if j.casefold() == "section": - if x[i+1] == "": + if x[i + 1] == "": continue else: - section = x[i+1].strip() - + section = x[i + 1].strip() + list_with_both = [current_chapter, section] if list_with_both not in lists_with_both: lists_with_both.append(list_with_both) - + return lists_with_both + def query_section_text(chapter_section_list: tuple[str, str]) -> str | float: - """ + """ Makes an API call to retrieve text data based on the provided chapter and section. - + Parameters: - chapter_section_list (list): A list containing two elements - chapter and section, e.g., ['2', '15D']. - + Returns: - result (str): The text data retrieved from the API. - + Note: - This function uses the malegislature.gov API to fetch text data for a specific chapter and section. - - """ + + """ result = """""" - + try: # unpack section and chapter for example: ['2', '15D'] chapter, section = chapter_section_list - link = f'https://malegislature.gov/api/Chapters/{chapter}/Sections/{section}' + link = f"https://malegislature.gov/api/Chapters/{chapter}/Sections/{section}" r = requests.get(link, verify=False) r = r.json() - + # fields to extract - result = r.get("Text", np.nan) + result = r.get("Text", np.nan) return result - except RequestException as e: - + except RequestException: pass -def query_section_text_all_bills(chapter_section_lists: list[tuple[str, str]]) -> tuple[list[str], list[tuple[str, str]]]: + +def query_section_text_all_bills( + chapter_section_lists: list[tuple[str, str]], +) -> tuple[list[str], list[tuple[str, str]]]: """ Retrieves text data for each chapter-section pair in the given sample; prints chapter-section numbers to keep track of the progress. @@ -187,14 +200,14 @@ def query_section_text_all_bills(chapter_section_lists: list[tuple[str, str]]) - Returns: - formatted_data(list): A list containing formatted text data for each non-empty chapter-section pair in the sample. - - empty_responses(list): A list containing chapter section pairs where API doesn't return anything + - empty_responses(list): A list containing chapter section pairs where API doesn't return anything Note: - This function prints the provided chapter-section pairs and retrieves text data for each pair using the `make_api_call` function. - The function skips empty or None pairs and ignores pairs with empty or NaN text data. - The formatted text data for each non-empty pair is stored in a list, which is then returned. """ - + # Storing and printing each pair requests.packages.urllib3.disable_warnings(InsecureRequestWarning) formatted_data = [] @@ -205,36 +218,39 @@ def query_section_text_all_bills(chapter_section_lists: list[tuple[str, str]]) - if str(chapter_section_lists) == "nan": return - + # Iterate through each pair in the chapter_section_lists for pair in chapter_section_lists: - if len(pair) == 0: continue else: string = query_section_text(pair) if string in {None, np.nan, "", "nan"}: - empty_responses.append(pair) #get a list of chapter-section pair where the API call returns an empty list + empty_responses.append( + pair + ) # get a list of chapter-section pair where the API call returns an empty list continue else: result += string formatted_data.append(result) - + return formatted_data, empty_responses -def get_chap_sec_names_internal(chap_sec_lists: list, mgl_names_file_path: str = "./chapter_section_names.pq") -> str: - + +def get_chap_sec_names_internal( + chap_sec_lists: list, mgl_names_file_path: str = "./chapter_section_names.pq" +) -> str: """ - Fetches chapter and section names for a given bill number from a local parquet file. + Fetches chapter and section names for a given bill number from a local parquet file. TODO delete this function after we setup a robust database backend with the MGL data. - + Args: chap_sec_lists (list): list of tuples containing chapter number and section numbers. mgl_names_file_path (str): path for the file containing chapter and section names. Expected columns are `Chapter_Number`, `Section_Number`, `Chapter`, `Section Name` - + Returns: str: All chapter and section names pairs concatenated together. - + The function assumes the DataFrame has the necessary columns. """ @@ -245,15 +261,24 @@ def get_chap_sec_names_internal(chap_sec_lists: list, mgl_names_file_path: str = for tup in chap_sec_lists: chap, sec = tup try: - chapter_name = names_df[(names_df["Chapter_Number"] == chap) & (names_df["Section_Number"] == sec)]['Chapter'].values[0] - section_name = names_df[(names_df["Chapter_Number"] == chap) & (names_df["Section_Number"] == sec)]['Section Name'].values[0] + chapter_name = names_df[ + (names_df["Chapter_Number"] == chap) + & (names_df["Section_Number"] == sec) + ]["Chapter"].values[0] + section_name = names_df[ + (names_df["Chapter_Number"] == chap) + & (names_df["Section_Number"] == sec) + ]["Section Name"].values[0] names[chapter_name] = section_name - except Exception as e: + except Exception: continue return ", ".join([f"{key}: {value}" for key, value in names.items()]) -def count_tokens(bill_title:str, bill_text:str, mgl_ref:str, mgl_names:str, committee_info:str): + +def count_tokens( + bill_title: str, bill_text: str, mgl_ref: str, mgl_names: str, committee_info: str +): """ Outputs the number of tokens for the given documents @@ -267,69 +292,80 @@ def count_tokens(bill_title:str, bill_text:str, mgl_ref:str, mgl_names:str, comm Returns: int: token_count """ - + encoding = tiktoken.encoding_for_model(GPT_MDOEL_VERSION) - text = str(bill_title) + str(bill_text) + str(mgl_ref) + str(mgl_names) + str(committee_info) + text = ( + str(bill_title) + + str(bill_text) + + str(mgl_ref) + + str(mgl_names) + + str(committee_info) + ) token_count = len(encoding.encode(text)) print(f"The text contains {token_count} tokens.") return token_count -def set_my_llm_cache(cache_file: Path=LLM_CACHE) -> SQLiteCache: + +def set_my_llm_cache(cache_file: Path = LLM_CACHE) -> SQLiteCache: """ Set an LLM cache, which allows for previously executed completions to be loaded from disk instead of repeatedly queried. """ cache_file.parent.mkdir(exist_ok=True) - set_llm_cache(SQLiteCache(database_path = cache_file)) + set_llm_cache(SQLiteCache(database_path=cache_file)) + @dataclass() -class BillDetails(): - ''' - A class to store all the details pertaining to a bill. - ''' - - bill_id: str = '' - bill_title: str = '' - bill_text: str = '' - mgl_ref: str = '' - committee_info: str = '' - mgl_names: str = '' +class BillDetails: + """ + A class to store all the details pertaining to a bill. + """ + + bill_id: str = "" + bill_title: str = "" + bill_text: str = "" + mgl_ref: str = "" + committee_info: str = "" + mgl_names: str = "" invoke_dict: dict = field(default_factory=list) - summary: str = '' + summary: str = "" -@dataclass() -class LLMResults: - ''' +@dataclass() +class LLMResults: + """ A class to store the results of the LLM. - ''' - query: str = '' - response: str = '' + """ -def extract_bill_context(bill_text: str) -> tuple: + query: str = "" + response: str = "" - ''' + +def extract_bill_context(bill_text: str) -> tuple: + """ This function takes in bill text, extracts the referenced MGL sections and returns them - Arguments: + Arguments: bill_text (str): Actual bill text Returns: A tuple of (combined_mgl, mgl_names) combined_mgl (str): All the relevant MGL section strings concatenated to a big string mgl_names (tuple): Tuple of referenced MGL section numbers - ''' + """ sections = extract_sections(bill_text) mgl_list, empty_responses = query_section_text_all_bills(sections) - combined_mgl = ' '.join(mgl_list) if len(mgl_list) != 0 else "None" + combined_mgl = " ".join(mgl_list) if len(mgl_list) != 0 else "None" mgl_names = get_chap_sec_names_internal(sections) return combined_mgl, mgl_names -def get_summaries_and_tags_api_function(bill_id: str, bill_title: str, bill_text: str) -> dict: +def get_summaries_and_tags_api_function( + bill_id: str, bill_title: str, bill_text: str +) -> dict: """ Generates both a summary and relevant tags for a given legislative bill in a single API call. @@ -337,8 +373,8 @@ def get_summaries_and_tags_api_function(bill_id: str, bill_title: str, bill_text 1. Generates a summary of the bill using get_summary_api_function 2. Uses this summary to generate relevant tags using get_tags_api_function_v2 - The sequential processing ensures that tags are generated based on the distilled - information in the summary rather than the full bill text, potentially improving + The sequential processing ensures that tags are generated based on the distilled + information in the summary rather than the full bill text, potentially improving tagging accuracy and consistency. Args: @@ -366,31 +402,29 @@ def get_summaries_and_tags_api_function(bill_id: str, bill_title: str, bill_text - If summary generation fails, tag generation is not attempted """ - response_obj = { - 'status': -1, - 'summary': '', - 'tags': [] - } + response_obj = {"status": -1, "summary": "", "tags": []} # Get the summary summary_response = get_summary_api_function(bill_id, bill_title, bill_text) response_obj.update(summary_response) - if response_obj['summary'] == '' or response_obj['status'] != 1: + if response_obj["summary"] == "" or response_obj["status"] != 1: return response_obj # Get tags - tags_response = get_tags_api_function_v2(bill_id, bill_title, response_obj['summary']) + tags_response = get_tags_api_function_v2( + bill_id, bill_title, response_obj["summary"] + ) response_obj.update(tags_response) return response_obj -def get_summary_api_function(bill_id: str, bill_title: str, bill_text: str) -> dict: +def get_summary_api_function(bill_id: str, bill_title: str, bill_text: str) -> dict: """ Generates a summary for a given legislative bill. - This function processes the input bill information, extracts relevant context from the + This function processes the input bill information, extracts relevant context from the Massachusetts General Laws (MGL), and uses a language model to generate a concise summary. Args: @@ -418,32 +452,32 @@ def get_summary_api_function(bill_id: str, bill_title: str, bill_text: str) -> d """ # extract relevant mgl text combined_mgl, mgl_names = extract_bill_context(bill_text) - + # create bill_details object bill_details = BillDetails( - bill_id = bill_id, - bill_title = bill_title, - bill_text = bill_text, - mgl_ref = combined_mgl, - committee_info = 'None:None', - mgl_names = mgl_names, + bill_id=bill_id, + bill_title=bill_title, + bill_text=bill_text, + mgl_ref=combined_mgl, + committee_info="None:None", + mgl_names=mgl_names, ) # call the summary function status_code, results = get_summary(bill_details) # return response attribute of returned value - if status_code != 1: - return {'status': status_code, 'summary': ''} - else: - return {'status': status_code, 'summary': normalize_summary(results.response)} + if status_code != 1: + return {"status": status_code, "summary": ""} + else: + return {"status": status_code, "summary": normalize_summary(results.response)} + def get_tags_api_function(bill_id: str, bill_title: str, bill_text: str) -> dict: - """ Generates relevant tags for a given legislative bill. - This function processes the input bill information, extracts relevant context from the + This function processes the input bill information, extracts relevant context from the Massachusetts General Laws (MGL), and uses a language model to generate appropriate tags. Args: @@ -472,28 +506,28 @@ def get_tags_api_function(bill_id: str, bill_title: str, bill_text: str) -> dict # extract relevant mgl text combined_mgl, mgl_names = extract_bill_context(bill_text) - + # create bill_details object bill_details = BillDetails( - bill_id = bill_id, - bill_title = bill_title, - bill_text = bill_text, - mgl_ref = combined_mgl, - committee_info = 'None:None', - mgl_names = mgl_names, + bill_id=bill_id, + bill_title=bill_title, + bill_text=bill_text, + mgl_ref=combined_mgl, + committee_info="None:None", + mgl_names=mgl_names, ) # call the summary function status_code, results = get_tags(bill_details) # return response attribute of returned value - if status_code != 1: - return {'status': status_code, 'tags': []} - else: - return {'status': status_code, 'tags': results.response} + if status_code != 1: + return {"status": status_code, "tags": []} + else: + return {"status": status_code, "tags": results.response} -def get_tags_api_function_v2(bill_id: str, bill_title: str, bill_summary: str) -> dict: +def get_tags_api_function_v2(bill_id: str, bill_title: str, bill_summary: str) -> dict: """ Generates tags for a legislative bill using its summary instead of full text. @@ -529,26 +563,25 @@ def get_tags_api_function_v2(bill_id: str, bill_title: str, bill_summary: str) - """ bill_details = BillDetails( - bill_id = bill_id, - bill_title = bill_title, - summary = bill_summary + bill_id=bill_id, bill_title=bill_title, summary=bill_summary ) status_code, results = get_tags_v2(bill_details) - if status_code != 1: - return {'status': status_code, 'tags': []} - else: - return {'status': status_code, 'tags': results.response} + if status_code != 1: + return {"status": status_code, "tags": []} + else: + return {"status": status_code, "tags": results.response} + def get_llm_call_type(bill_details: BillDetails) -> str: """ This function calculates number of tokens and decides on weather to use RAG or not. It reutrns a string output - that specifies how to call the LLM. + that specifies how to call the LLM. - Args: + Args: bill_details (BillDetails): object consisting of bill_text, bill_title, mgl_ref, commottee_info, mgl_names - Returns: + Returns: str: 'large' or 'small' depeneding upon token count """ @@ -560,44 +593,53 @@ def get_llm_call_type(bill_details: BillDetails) -> str: mgl_names = getattr(bill_details, "mgl_names") num_tokens = count_tokens(bill_title, bill_text, mgl_ref, mgl_names, committee_info) - - return 'small' if num_tokens < MAX_TOKEN_LIMIT - 5000 else 'large' + + return "small" if num_tokens < MAX_TOKEN_LIMIT - 5000 else "large" + def get_category_tags(categories: List) -> List: """ - This function takes in list of categories and returns tags pertinant to that specifc categories only. + This function takes in list of categories and returns tags pertinant to that specifc categories only. - Args: + Args: categories (List(str)): List of category strings. - Returns: - List of all tags specific to those of categories. + Returns: + List of all tags specific to those of categories. """ - tags_tuple = itemgetter(*set.intersection(set(categories), set(new_categories_for_bill_list)))(new_tags_for_bill_dict) - - if isinstance(tags_tuple, list): return tags_tuple + tags_tuple = itemgetter( + *set.intersection(set(categories), set(new_categories_for_bill_list)) + )(new_tags_for_bill_dict) + + if isinstance(tags_tuple, list): + return tags_tuple category_tags = [] - for cts in tags_tuple: category_tags += cts + for cts in tags_tuple: + category_tags += cts return category_tags + def get_summary(bill_details: BillDetails) -> tuple[int, LLMResults]: - ''' - This function takes in bill details object (bill title, bill text and reference mgl section text) and summarizes the bill. + """ + This function takes in bill details object (bill title, bill text and reference mgl section text) and summarizes the bill. - Arguments: + Arguments: bill_details (BillDetails): Object containing information about the bill - bill_text, bill_title, mgl_ref, commottee_info, mgl_names - Returns: + Returns: A tuple of status_code and an LLMResults object containing query, response from the LLM status_code can take these following values {1: Success, -1: Necessary details not found} - ''' + """ - if not all(hasattr(bill_details, attr) for attr in ("bill_text", 'bill_title', 'mgl_names', 'committee_info')): + if not all( + hasattr(bill_details, attr) + for attr in ("bill_text", "bill_title", "mgl_names", "committee_info") + ): return -1, LLMResults() set_my_llm_cache() @@ -606,8 +648,8 @@ def get_summary(bill_details: BillDetails) -> tuple[int, LLMResults]: query = get_query_for_summarization(bill_details, llm_call_type) return 1, call_llm(bill_details, query, llm_call_type) -def get_tags(bill_details: BillDetails) -> tuple[int, LLMResults]: +def get_tags(bill_details: BillDetails) -> tuple[int, LLMResults]: """ Tags a legislative bill using a two-step process involving categorization and LLM-based tag selection. @@ -636,7 +678,10 @@ def get_tags(bill_details: BillDetails) -> tuple[int, LLMResults]: The function requires all necessary bill details to be present in the BillDetails object for successful execution. """ - if not all(hasattr(bill_details, attr) for attr in ("bill_text", 'bill_title', 'mgl_names', 'committee_info')): + if not all( + hasattr(bill_details, attr) + for attr in ("bill_text", "bill_title", "mgl_names", "committee_info") + ): return -1, LLMResults() set_my_llm_cache() @@ -652,12 +697,14 @@ def get_tags(bill_details: BillDetails) -> tuple[int, LLMResults]: tag_response = call_llm(bill_details, query_2, llm_call_type) # parses the response from LLM and removes hallucinated tags - tag_response.response = list(set(extract_categories_tags(tag_response.response)) & set(category_tags)) + tag_response.response = list( + set(extract_categories_tags(tag_response.response)) & set(category_tags) + ) return 1, tag_response -def get_tags_v2(bill_details: BillDetails) -> LLMResults: +def get_tags_v2(bill_details: BillDetails) -> LLMResults: """ Helper function that generates tags for a bill using its summary. @@ -692,24 +739,26 @@ def get_tags_v2(bill_details: BillDetails) -> LLMResults: - Relies on TAGGING_PROMPT_USING_SUMMARIES template from prompts.py """ - if not all(hasattr(bill_details, attr) for attr in ("summary", "bill_title")): + if not all(hasattr(bill_details, attr) for attr in ("summary", "bill_title")): return -2, LLMResults() set_my_llm_cache() - llm_call_type = 'small' + llm_call_type = "small" query = TAGGING_PROMPT_USING_SUMMARIES bill_details.invoke_dict = { - 'bill_title': bill_details.bill_title, - 'context': [Document(page_content = f"```{bill_details.summary}```")], - 'tags': new_tags + "bill_title": bill_details.bill_title, + "context": [Document(page_content=f"```{bill_details.summary}```")], + "tags": new_tags, } tag_response = call_llm(bill_details, query, llm_call_type) - tag_response.response = list(set(extract_categories_tags(tag_response.response)) & set(new_tags)) + tag_response.response = list( + set(extract_categories_tags(tag_response.response)) & set(new_tags) + ) return 1, tag_response -def extract_categories_tags(response: str) -> list: +def extract_categories_tags(response: str) -> list: """ Extracts categories or tags from a string response. @@ -731,43 +780,51 @@ def extract_categories_tags(response: str) -> list: Empty elements (resulting from consecutive '#' characters) will be removed from the final list. """ - response = response.split('#') + response = response.split("#") return [i.strip() for i in response] + def prepare_invoke_dict(bill_details: BillDetails) -> dict: """ - This function prepares the dict object that is used in chain.invoke function to call the LLM with prompt and - required details. + This function prepares the dict object that is used in chain.invoke function to call the LLM with prompt and + required details. - Args: + Args: bill_details (BillDetails): Object containing information about the bill - bill_text, bill_title, mgl_ref, commottee_info, mgl_names - Returns: - dict object containing all the necessary keys and values required for invoke call. + Returns: + dict object containing all the necessary keys and values required for invoke call. """ - text_splitter = CharacterTextSplitter(chunk_size = 90000, chunk_overlap = 1000) + text_splitter = CharacterTextSplitter(chunk_size=90000, chunk_overlap=1000) return { - "title": bill_details.bill_title, - "context": [Document(page_content = f"```{x}```") for x in text_splitter.split_text(bill_details.bill_text)], - "names": bill_details.mgl_names, - "mgl_sections": [Document(page_content = f"```{x}```") for x in text_splitter.split_text(bill_details.mgl_ref)], - "committee_info": bill_details.committee_info - } + "title": bill_details.bill_title, + "context": [ + Document(page_content=f"```{x}```") + for x in text_splitter.split_text(bill_details.bill_text) + ], + "names": bill_details.mgl_names, + "mgl_sections": [ + Document(page_content=f"```{x}```") + for x in text_splitter.split_text(bill_details.mgl_ref) + ], + "committee_info": bill_details.committee_info, + } + def get_query_for_summarization(bill_details: BillDetails, llm_call_type: str) -> str: """ Prepares a prompt for bill summarization based on the specified LLM call type. - This function constructs a query string for summarizing a legislative bill. It uses - predefined templates for small and large LLM call types, ensuring consistency across + This function constructs a query string for summarizing a legislative bill. It uses + predefined templates for small and large LLM call types, ensuring consistency across different parts of the application. Args: - bill_details (BillDetails): Object containing bill information. This object may + bill_details (BillDetails): Object containing bill information. This object may be modified in place for 'small' call types. - llm_call_type (str): Specifies the type of LLM call to make. + llm_call_type (str): Specifies the type of LLM call to make. Can be either "small" (standard approach) or "large" (RAG approach). Returns: @@ -775,16 +832,16 @@ def get_query_for_summarization(bill_details: BillDetails, llm_call_type: str) - for bill summarization. Note: - If llm_call_type is 'small', this function will update the `invoke_dict` attribute of + If llm_call_type is 'small', this function will update the `invoke_dict` attribute of the `bill_details` object. """ - - if llm_call_type == 'large': + + if llm_call_type == "large": query = SUMMARIZATION_PROMPT_LARGE.format( bill_title=getattr(bill_details, "bill_title"), bill_text=getattr(bill_details, "bill_text"), mgl_names=getattr(bill_details, "mgl_names"), - committee_info=getattr(bill_details, "committee_info") + committee_info=getattr(bill_details, "committee_info"), ) else: bill_details.invoke_dict = prepare_invoke_dict(bill_details) @@ -792,19 +849,19 @@ def get_query_for_summarization(bill_details: BillDetails, llm_call_type: str) - return query -def get_query_for_categorizing(bill_details: BillDetails, llm_call_type: str) -> str: +def get_query_for_categorizing(bill_details: BillDetails, llm_call_type: str) -> str: """ Prepares a prompt for bill categorization based on the specified LLM call type. - This function constructs a query string for categorizing a legislative bill. It uses - predefined templates for small and large LLM call types, ensuring consistency across + This function constructs a query string for categorizing a legislative bill. It uses + predefined templates for small and large LLM call types, ensuring consistency across different parts of the application. Args: - bill_details (BillDetails): Object containing bill information. This object may + bill_details (BillDetails): Object containing bill information. This object may be modified in place for 'small' call types. - llm_call_type (str): Specifies the type of LLM call to make. + llm_call_type (str): Specifies the type of LLM call to make. Can be either "small" (standard approach) or "large" (RAG approach). Returns: @@ -812,39 +869,45 @@ def get_query_for_categorizing(bill_details: BillDetails, llm_call_type: str) -> for bill categorization. Note: - If llm_call_type is 'small', this function will update the `invoke_dict` attribute of + If llm_call_type is 'small', this function will update the `invoke_dict` attribute of the `bill_details` object inplace. """ - if llm_call_type == 'large': + if llm_call_type == "large": query = CATEGORIZATION_PROMPT_LARGE.format( - categories=getattr(bill_details, 'categories', new_categories_for_bill_list), - bill_title=getattr(bill_details, 'bill_title'), - bill_text=getattr(bill_details, 'bill_text'), - committee_info=getattr(bill_details, 'committee_info'), - mgl_names=getattr(bill_details, "mgl_names") + categories=getattr( + bill_details, "categories", new_categories_for_bill_list + ), + bill_title=getattr(bill_details, "bill_title"), + bill_text=getattr(bill_details, "bill_text"), + committee_info=getattr(bill_details, "committee_info"), + mgl_names=getattr(bill_details, "mgl_names"), ) - else: + else: query = CATEGORIZATION_PROMPT_SMALL bill_details.invoke_dict = prepare_invoke_dict(bill_details) - bill_details.invoke_dict['categories'] = getattr(bill_details, 'categories', new_categories_for_bill_list) + bill_details.invoke_dict["categories"] = getattr( + bill_details, "categories", new_categories_for_bill_list + ) return query -def get_query_for_tagging(bill_details: BillDetails, category_tags: list, llm_call_type: str) -> str: +def get_query_for_tagging( + bill_details: BillDetails, category_tags: list, llm_call_type: str +) -> str: """ Prepares a prompt for bill tagging based on the specified LLM call type. - This function constructs a query string for tagging a legislative bill. It uses - predefined templates for small and large LLM call types, ensuring consistency across + This function constructs a query string for tagging a legislative bill. It uses + predefined templates for small and large LLM call types, ensuring consistency across different parts of the application. Args: - bill_details (BillDetails): Object containing bill information. This object may + bill_details (BillDetails): Object containing bill information. This object may be modified in place for 'small' call types. category_tags (list): List of tags that the model has to filter from. - llm_call_type (str): Specifies the type of LLM call to make. + llm_call_type (str): Specifies the type of LLM call to make. Can be either "small" (standard approach) or "large" (RAG approach). Returns: @@ -852,80 +915,86 @@ def get_query_for_tagging(bill_details: BillDetails, category_tags: list, llm_ca for bill tagging. Note: - If llm_call_type is 'small', this function will update the `invoke_dict` attribute of + If llm_call_type is 'small', this function will update the `invoke_dict` attribute of the `bill_details` object. """ - if llm_call_type == 'large': + if llm_call_type == "large": query = TAGGING_PROMPT_LARGE.format( - category_tags=', '.join(category_tags), - bill_title=getattr(bill_details, 'bill_title'), - bill_text=getattr(bill_details, 'bill_text'), - committee_info=getattr(bill_details, 'committee_info'), - mgl_names=getattr(bill_details, "mgl_names") + category_tags=", ".join(category_tags), + bill_title=getattr(bill_details, "bill_title"), + bill_text=getattr(bill_details, "bill_text"), + committee_info=getattr(bill_details, "committee_info"), + mgl_names=getattr(bill_details, "mgl_names"), ) - else: - - query =TAGGING_PROMPT_SMALL + else: + query = TAGGING_PROMPT_SMALL bill_details.invoke_dict = prepare_invoke_dict(bill_details) - bill_details.invoke_dict['category_tags'] = category_tags + bill_details.invoke_dict["category_tags"] = category_tags return query -def call_llm(bill_details: BillDetails, query: str, llm_call_type: str = 'small') -> LLMResults: + +def call_llm( + bill_details: BillDetails, query: str, llm_call_type: str = "small" +) -> LLMResults: """ - + This is a generic function that calls the LLM with given query - Args: + Args: bill_details (BillDetails): Object containing information about the bill - bill_text, bill_title, mgl_ref, commottee_info, mgl_names - query (str): Query string containing details on what model has to do. - llm_call_type (str): This argument can take 2 values ("small": No use of RAG, "large": Use RAG) + query (str): Query string containing details on what model has to do. + llm_call_type (str): This argument can take 2 values ("small": No use of RAG, "large": Use RAG) - Returns: + Returns: LLMResults: Object containing query, response (Raw unformatted response from model) and metrics (If requested) """ - llm = ChatOpenAI(temperature = 0, model = GPT_MDOEL_VERSION, model_kwargs = {'seed': 42}) + llm = ChatOpenAI(temperature=0, model=GPT_MDOEL_VERSION, model_kwargs={"seed": 42}) - if llm_call_type == 'small': + if llm_call_type == "small": response = small_docs(bill_details, query, llm) - else: + else: response = large_docs(bill_details, query, llm) - return_obj = LLMResults(query = query, response = response) + return_obj = LLMResults(query=query, response=response) return return_obj + def small_docs(bill_details: BillDetails, query: str, llm: ChatOpenAI) -> str: """ - + This function calls the LLM without using RAG - Generally used if token count is less than 128k - Args: + Args: bill_details (BillDetails): Object containing information about the bill - bill_text, bill_title, mgl_ref, commottee_info, mgl_names - query (str): Query string containing details on what model has to do. + query (str): Query string containing details on what model has to do. llm (ChatOpenAI): LLM call object - Returns: - (str): Raw response of the LLM. + Returns: + (str): Raw response of the LLM. """ prompt = PromptTemplate.from_template(query) chain = create_stuff_documents_chain(llm, prompt) - with get_openai_callback() as cb: + with get_openai_callback() as _cb: response = chain.invoke(bill_details.invoke_dict) return response + def format_docs(docs): return "\n\n".join(doc.page_content for doc in docs) -def get_or_create_embeddings(bill_details: BillDetails, emb_api: OpenAIEmbeddings) -> chromadb.PersistentClient: +def get_or_create_embeddings( + bill_details: BillDetails, emb_api: OpenAIEmbeddings +) -> chromadb.PersistentClient: """ Retrieves existing embeddings or creates new ones for a given bill. @@ -952,8 +1021,8 @@ def get_or_create_embeddings(bill_details: BillDetails, emb_api: OpenAIEmbedding bill_id = bill_details.bill_id client = chromadb.PersistentClient(CHROMA_DB_PATH) - collection = client.get_or_create_collection(name = "bills_collection") - + collection = client.get_or_create_collection(name="bills_collection") + existing_docs = collection.get(where={"bill_id": bill_id}) if not existing_docs["ids"]: @@ -966,8 +1035,7 @@ def get_or_create_embeddings(bill_details: BillDetails, emb_api: OpenAIEmbedding # the model's context window, as opposed to larger chunks that might contain # irrelevant information and waste context space. text_splitter = TokenTextSplitter.from_tiktoken_encoder( - chunk_size = 2000, - chunk_overlap = 200 + chunk_size=2000, chunk_overlap=200 ) if len(bill_details.mgl_ref) > 1e06: @@ -978,47 +1046,51 @@ def get_or_create_embeddings(bill_details: BillDetails, emb_api: OpenAIEmbedding embeddings = emb_api.embed_documents(documents) collection.add( - documents = documents, - embeddings = embeddings, - metadatas = [{"bill_id": bill_id} for _ in documents], - ids = [f"{bill_id}_{i}" for i in range(len(documents))] + documents=documents, + embeddings=embeddings, + metadatas=[{"bill_id": bill_id} for _ in documents], + ids=[f"{bill_id}_{i}" for i in range(len(documents))], ) return client + def large_docs(bill_details: BillDetails, query: str, llm: ChatOpenAI) -> str: """ - + This function calls the LLM using RAG - Generally used if token count is greater than 128k - Args: + Args: bill_details (BillDetails): Object containing information about the bill - bill_text, bill_title, mgl_ref, commottee_info, mgl_names - query (str): Query string containing details on what model has to do. + query (str): Query string containing details on what model has to do. llm (ChatOpenAI): LLM call object - Returns: - (str): Raw response of the LLM. + Returns: + (str): Raw response of the LLM. """ emb_api = OpenAIEmbeddings() chroma_client = get_or_create_embeddings(bill_details, emb_api) - + vectorstore = Chroma( client=chroma_client, collection_name="bills_collection", - embedding_function=emb_api + embedding_function=emb_api, + ) + + retrieval_doc_count = min( + (MAX_TOKEN_LIMIT - count_tokens("", bill_details.bill_text, "", "", "")) // 2000 + - 2, + 7, ) - retrieval_doc_count = min((MAX_TOKEN_LIMIT - count_tokens('', bill_details.bill_text, '', '', ''))//2000 - 2, 7) - retriever = vectorstore.as_retriever( - search_type="similarity", + search_type="similarity", search_kwargs={ "k": retrieval_doc_count, - "filter": {"bill_id": bill_details.bill_id} - } + "filter": {"bill_id": bill_details.bill_id}, + }, ) - # retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 12}) @@ -1038,10 +1110,10 @@ def large_docs(bill_details: BillDetails, query: str, llm: ChatOpenAI) -> str: | StrOutputParser() ) - try: - with get_openai_callback() as cb: + try: + with get_openai_callback() as _cb: response = rag_chain.invoke(query) - except Exception as e: + except Exception as e: print(e) return response diff --git a/llm/requirements.txt b/llm/requirements.txt index 92dbdd75b..d76b88efa 100644 --- a/llm/requirements.txt +++ b/llm/requirements.txt @@ -11,6 +11,7 @@ pandas==2.2.2 pytest==8.3.5 requests==2.32.3 rouge_score==0.1.2 +ruff==0.14.5 scikit-learn==1.5.0 streamlit==1.35.0 tiktoken==0.7.0 diff --git a/llm/tag_categories.py b/llm/tag_categories.py index ac6023b27..8f473d4e7 100644 --- a/llm/tag_categories.py +++ b/llm/tag_categories.py @@ -248,247 +248,536 @@ } """ -category_for_bill_v2 = ['Commerce', 'Crime and Law Enforcement', 'Economics and Public Finance', - 'Education', 'Emergency Management', 'Energy', 'Environmental Protection', 'Families', - 'Government Operations and Politics', 'Healthcare', 'Food, Drugs and Alcohol', - 'Housing and Community Development', 'Immigration', 'Labor and Employment', 'Law and Judiciary', - 'Public and Natural Resources', 'Science, Technology, Communications', 'Social Services', - 'Sports and Recreation', 'Taxation', 'Transportation and Public Works'] - -tags_for_bill_v2 = {'Commerce': ['Banking and financial institutions regulation', - 'Business ethics', - 'Competition and antitrust', - 'Consumer affairs', - 'Corporate finance and management', - 'Marketing and advertising', - 'Retail and wholesale trades', - 'Securities'], - - 'Crime and Law Enforcement': ['Assault and harassment offenses', - 'Crimes against animals and natural resources', - 'Crimes against children', - 'Property Crimes', - 'Criminal investigation, prosecution, interrogation', - 'Criminal justice information and records', - 'Criminal procedure and sentencing', - 'Firearms and explosives', - 'Fraud offenses and financial crimes', - 'Correctional Facilities', - 'Criminal Justice Reform'], - - 'Economics and Public Finance': ['Budget process', - 'Debt collection', - 'Economic development', - 'Economic theory', - 'Employment taxes', - 'Financial crises and stabilization', - 'Financial literacy', - 'Financial services and investments', - 'Interest, dividends, interest rates', - 'Labor-management relations', - 'Public contracts and procurement', - 'Pension and retirement benefits'], - - 'Education': ['Academic performance and assessments', - 'Adult education and literacy', - 'Educational facilities and institutions', - 'Elementary and secondary education', - 'Higher education', - 'Curriculum and standards', - 'Special education', - 'Student aid and college costs', - 'Teachers and educators', - 'Technology assessment', - 'Vocational and technical education'], - - 'Emergency Management': ['Accidents', - 'Disaster relief and insurance', - 'Emergency communications systems', - 'Emergency medical services and trauma care', - 'Emergency planning and evacuation', - 'Hazards and emergency operations', - 'Search and rescue operations'], - - 'Energy': ['Energy assistance', - 'Energy efficiency and conservation', - 'Energy prices', - 'Energy research', - 'Energy revenues and royalties', - 'Energy storage, supplies, demand', - 'Renewable energy sources'], - - 'Environmental Protection': ['Air quality', - 'Environmental assessment, monitoring, research', - 'Environmental education', - 'Environmental health', - 'Environmental regulatory procedures', - 'Hazardous wastes and toxic substances', - 'Pollution control and abatement', - 'Soil pollution', - 'Solid waste and recycling', - 'Water quality', - 'Wetlands'], - - 'Families': ['Adoption and foster care', - 'Family planning and birth control', - 'Family relationships and status', - 'Family services', - 'Parenting'], - - 'Government Operations and Politics': ['Census and government statistics', - 'Election administration', - 'Government ethics and transparency', - 'Government information and archives', - 'Government studies and investigations', - 'Government trust funds', - 'Legislative rules and procedure', - 'Lobbying and campaign finance', - 'Political advertising', - 'Political representation', - 'public-private cooperation'], - - 'Healthcare': ['Alternative treatments', - 'Telehealth', - 'Public Health', - 'Patient Care', - 'Dental care', - 'Health care costs', - 'Health insurance and coverage', - 'Health facilities and institutions', - 'Health information and medical records', - 'Health technology, devices, supplies', - 'Substance use disorder', - 'Healthcare workforce', - 'Medical research', - 'Mental health', - 'Prescription drugs', - 'Healthcare oversight', - 'Sex and reproductive health'], - - 'Food, Drugs and Alcohol': ['Alcoholic beverages', - 'Drug, alcohol, tobacco use', - 'Food industry and services', - 'Food supply, safety, and labeling', - 'Nutrition and diet', - 'Food service employment', - 'Drug safety, medical device, and laboratory regulation', - 'Nutrition and diet'], - - 'Housing and Community Development': ['Community life and organization', - 'Cooperative and condominium housing', - 'Homelessness and emergency shelter', - 'Housing discrimination', - 'Housing finance and home ownership', - 'Housing for the elderly and disabled', - 'Housing industry and standards', - 'Housing supply and affordability', - 'Landlord and tenant', - 'Low- and moderate-income housing', - 'Residential rehabilitation and home repair'], - - 'Immigration': ['Citizenship and naturalization', - 'Immigrant health and welfare', - 'Immigration status and procedures', - 'Language services', - 'Refugees, asylum, displaced persons', - 'Right to shelter'], - - 'Labor and Employment': ['Employee benefits and pensions', - 'Employee hiring', - 'Employee leave', - 'Employee performance', - 'Employment and training programs', - 'Employment discrimination and employee rights', - 'Labor standards', - 'Migrant, seasonal, agricultural labor', - 'Self-employment', - 'Temporary and part-time employment', - "Workers' compensation", - 'Worker safety and health', - 'Youth employment and child labor'], - - 'Law and Judiciary': ['Administrative law and regulatory procedures', - 'Administrative remedies', - 'Civil actions and liability', - 'Civil disturbances', - 'Evidence and witnesses', - 'Judicial procedure and administration', - 'Judicial review and appeals', - 'Jurisdiction and venue', - 'Legal fees and court costs', - 'Property rights'], - - 'Public and Natural Resources': ['Forests, forestry, trees', - 'General public lands matters', - 'Marine and coastal resources, fisheries', - 'Marine pollution', - 'Monuments and memorials', - 'Water resources', - 'Wilderness'], - - 'Science, Technology, Communications': ['Advanced technology and technological innovations', - 'Atmospheric science and weather', - 'Computer security and identity theft', - 'Computers and information technology', - 'Earth sciences', - 'Ecology', - 'Environmental technology', - 'Genetics', - 'Internet, web applications, social media', - 'Photography and imaging', - 'Radio spectrum allocation', - 'Telecommunication rates and fees', - 'Telephone and wireless communication', - 'Television and film'], - - 'Social Services': ['Child care and development', - 'Domestic violence and child abuse', - 'Food assistance and relief', - 'Home and outpatient care', - 'Social work, volunteer service, charitable organizations', - 'Unemployment', - 'Urban and suburban affairs and development', - "Veterans' education, employment, rehabilitation", - "Veterans' loans, housing, homeless programs", - "Veterans' medical care"], - 'Sports and Recreation': ['Athletics', - 'Games and hobbies', - 'Hunting and fishing', - 'Outdoor recreation', - 'Parks, recreation areas, trails', - 'Performing arts', - 'Professional sports', - 'Sports and recreation facilities'], - 'Taxation': ['Capital gains tax', - 'Corporate tax', - 'Estate tax', - 'Excise tax', - 'Gift tax', - 'Income tax', - 'Payroll tax', - 'Property tax', - 'Sales tax', - 'Tariffs', - 'Transfer and inheritance taxes', - 'Tax-exempt organizations'], - 'Transportation and Public Works': ['Aviation and airports', - 'Highways and roads', - 'Maritime affairs and fisheries', - 'MBTA & Public Transportation', - 'Public utilities and utility rates', - 'Railroads', - 'Transportation safety and security', - 'Water storage', - 'Water use and supply']} +category_for_bill_v2 = [ + "Commerce", + "Crime and Law Enforcement", + "Economics and Public Finance", + "Education", + "Emergency Management", + "Energy", + "Environmental Protection", + "Families", + "Government Operations and Politics", + "Healthcare", + "Food, Drugs and Alcohol", + "Housing and Community Development", + "Immigration", + "Labor and Employment", + "Law and Judiciary", + "Public and Natural Resources", + "Science, Technology, Communications", + "Social Services", + "Sports and Recreation", + "Taxation", + "Transportation and Public Works", +] + +tags_for_bill_v2 = { + "Commerce": [ + "Banking and financial institutions regulation", + "Business ethics", + "Competition and antitrust", + "Consumer affairs", + "Corporate finance and management", + "Marketing and advertising", + "Retail and wholesale trades", + "Securities", + ], + "Crime and Law Enforcement": [ + "Assault and harassment offenses", + "Crimes against animals and natural resources", + "Crimes against children", + "Property Crimes", + "Criminal investigation, prosecution, interrogation", + "Criminal justice information and records", + "Criminal procedure and sentencing", + "Firearms and explosives", + "Fraud offenses and financial crimes", + "Correctional Facilities", + "Criminal Justice Reform", + ], + "Economics and Public Finance": [ + "Budget process", + "Debt collection", + "Economic development", + "Economic theory", + "Employment taxes", + "Financial crises and stabilization", + "Financial literacy", + "Financial services and investments", + "Interest, dividends, interest rates", + "Labor-management relations", + "Public contracts and procurement", + "Pension and retirement benefits", + ], + "Education": [ + "Academic performance and assessments", + "Adult education and literacy", + "Educational facilities and institutions", + "Elementary and secondary education", + "Higher education", + "Curriculum and standards", + "Special education", + "Student aid and college costs", + "Teachers and educators", + "Technology assessment", + "Vocational and technical education", + ], + "Emergency Management": [ + "Accidents", + "Disaster relief and insurance", + "Emergency communications systems", + "Emergency medical services and trauma care", + "Emergency planning and evacuation", + "Hazards and emergency operations", + "Search and rescue operations", + ], + "Energy": [ + "Energy assistance", + "Energy efficiency and conservation", + "Energy prices", + "Energy research", + "Energy revenues and royalties", + "Energy storage, supplies, demand", + "Renewable energy sources", + ], + "Environmental Protection": [ + "Air quality", + "Environmental assessment, monitoring, research", + "Environmental education", + "Environmental health", + "Environmental regulatory procedures", + "Hazardous wastes and toxic substances", + "Pollution control and abatement", + "Soil pollution", + "Solid waste and recycling", + "Water quality", + "Wetlands", + ], + "Families": [ + "Adoption and foster care", + "Family planning and birth control", + "Family relationships and status", + "Family services", + "Parenting", + ], + "Government Operations and Politics": [ + "Census and government statistics", + "Election administration", + "Government ethics and transparency", + "Government information and archives", + "Government studies and investigations", + "Government trust funds", + "Legislative rules and procedure", + "Lobbying and campaign finance", + "Political advertising", + "Political representation", + "public-private cooperation", + ], + "Healthcare": [ + "Alternative treatments", + "Telehealth", + "Public Health", + "Patient Care", + "Dental care", + "Health care costs", + "Health insurance and coverage", + "Health facilities and institutions", + "Health information and medical records", + "Health technology, devices, supplies", + "Substance use disorder", + "Healthcare workforce", + "Medical research", + "Mental health", + "Prescription drugs", + "Healthcare oversight", + "Sex and reproductive health", + ], + "Food, Drugs and Alcohol": [ + "Alcoholic beverages", + "Drug, alcohol, tobacco use", + "Food industry and services", + "Food supply, safety, and labeling", + "Nutrition and diet", + "Food service employment", + "Drug safety, medical device, and laboratory regulation", + "Nutrition and diet", + ], + "Housing and Community Development": [ + "Community life and organization", + "Cooperative and condominium housing", + "Homelessness and emergency shelter", + "Housing discrimination", + "Housing finance and home ownership", + "Housing for the elderly and disabled", + "Housing industry and standards", + "Housing supply and affordability", + "Landlord and tenant", + "Low- and moderate-income housing", + "Residential rehabilitation and home repair", + ], + "Immigration": [ + "Citizenship and naturalization", + "Immigrant health and welfare", + "Immigration status and procedures", + "Language services", + "Refugees, asylum, displaced persons", + "Right to shelter", + ], + "Labor and Employment": [ + "Employee benefits and pensions", + "Employee hiring", + "Employee leave", + "Employee performance", + "Employment and training programs", + "Employment discrimination and employee rights", + "Labor standards", + "Migrant, seasonal, agricultural labor", + "Self-employment", + "Temporary and part-time employment", + "Workers' compensation", + "Worker safety and health", + "Youth employment and child labor", + ], + "Law and Judiciary": [ + "Administrative law and regulatory procedures", + "Administrative remedies", + "Civil actions and liability", + "Civil disturbances", + "Evidence and witnesses", + "Judicial procedure and administration", + "Judicial review and appeals", + "Jurisdiction and venue", + "Legal fees and court costs", + "Property rights", + ], + "Public and Natural Resources": [ + "Forests, forestry, trees", + "General public lands matters", + "Marine and coastal resources, fisheries", + "Marine pollution", + "Monuments and memorials", + "Water resources", + "Wilderness", + ], + "Science, Technology, Communications": [ + "Advanced technology and technological innovations", + "Atmospheric science and weather", + "Computer security and identity theft", + "Computers and information technology", + "Earth sciences", + "Ecology", + "Environmental technology", + "Genetics", + "Internet, web applications, social media", + "Photography and imaging", + "Radio spectrum allocation", + "Telecommunication rates and fees", + "Telephone and wireless communication", + "Television and film", + ], + "Social Services": [ + "Child care and development", + "Domestic violence and child abuse", + "Food assistance and relief", + "Home and outpatient care", + "Social work, volunteer service, charitable organizations", + "Unemployment", + "Urban and suburban affairs and development", + "Veterans' education, employment, rehabilitation", + "Veterans' loans, housing, homeless programs", + "Veterans' medical care", + ], + "Sports and Recreation": [ + "Athletics", + "Games and hobbies", + "Hunting and fishing", + "Outdoor recreation", + "Parks, recreation areas, trails", + "Performing arts", + "Professional sports", + "Sports and recreation facilities", + ], + "Taxation": [ + "Capital gains tax", + "Corporate tax", + "Estate tax", + "Excise tax", + "Gift tax", + "Income tax", + "Payroll tax", + "Property tax", + "Sales tax", + "Tariffs", + "Transfer and inheritance taxes", + "Tax-exempt organizations", + ], + "Transportation and Public Works": [ + "Aviation and airports", + "Highways and roads", + "Maritime affairs and fisheries", + "MBTA & Public Transportation", + "Public utilities and utility rates", + "Railroads", + "Transportation safety and security", + "Water storage", + "Water use and supply", + ], +} -new_categories_for_bill_list = ['Commerce', 'Crime and Law Enforcement', 'Economics and Public Finance', 'Education', - 'Emergency Management', 'Energy', 'Environmental Protection', 'Families', 'Government Operations and Politics', - 'Healthcare', 'Food, Drugs and Alcohol', 'Housing and Community Development', 'Immigrants and Foreign Nationals', - 'Labor and Employment', 'Law and Judiciary', 'Public and Natural Resources', 'Science, Technology, Communications', - 'Social Services', 'Sports and Recreation', 'Taxation', 'Transportation and Public Works'] +new_categories_for_bill_list = [ + "Commerce", + "Crime and Law Enforcement", + "Economics and Public Finance", + "Education", + "Emergency Management", + "Energy", + "Environmental Protection", + "Families", + "Government Operations and Politics", + "Healthcare", + "Food, Drugs and Alcohol", + "Housing and Community Development", + "Immigrants and Foreign Nationals", + "Labor and Employment", + "Law and Judiciary", + "Public and Natural Resources", + "Science, Technology, Communications", + "Social Services", + "Sports and Recreation", + "Taxation", + "Transportation and Public Works", +] new_tags_for_bill_dict = { "Commerce": [ + "Banking and financial institutions regulation", + "Partnerships and Limited Liability Companies", + "Non-Profit Law and Governance", + "Consumer Protection", + "Corporation Law and Governance", + "Marketing and advertising", + "Retail and wholesale trades", + "Securities", + ], + "Crime and Law Enforcement": [ + "Assault and harassment offenses", + "Crimes against animals and natural resources", + "Crimes against children", + "Property Crimes", + "Criminal investigation, prosecution, interrogation", + "Criminal justice information and records", + "Criminal Sentencing", + "Firearms and explosives", + "Fraud offenses and financial crimes", + "Correctional Facilities", + "Criminal Justice Reform", + ], + "Economics and Public Finance": [ + "Budget process", + "Debt collection", + "Financial literacy", + "Financial services and investments", + "Labor-management relations", + "Public contracts and procurement", + "Pension and retirement benefits", + ], + "Education": [ + "Academic performance and assessments", + "Adult education and literacy", + "Educational facilities and institutions", + "Elementary and secondary education", + "Higher education", + "Curriculum and standards", + "Special education", + "Student aid and college costs", + "Teachers and educators", + "Technology assessment", + "Vocational and technical education", + ], + "Emergency Management": [ + "Disaster relief and insurance", + "Emergency communications systems", + "Emergency medical services and trauma care", + "Emergency planning and evacuation", + "Hazards and emergency operations", + ], + "Energy": [ + "Energy assistance", + "Energy efficiency and conservation", + "Energy prices", + "Energy research", + "Energy storage, supplies, demand", + "Renewable energy sources", + ], + "Environmental Protection": [ + "Air quality", + "Environmental assessment, monitoring, research", + "Environmental education", + "Environmental health", + "Environmental regulatory procedures", + "Hazardous wastes and toxic substances", + "Pollution control and abatement", + "Soil pollution", + "Solid waste and recycling", + "Water quality", + "Wetlands", + ], + "Families": [ + "Adoption and foster care", + "Family planning and birth control", + "Family relationships and status", + "Family services", + "Parenting", + ], + "Government Operations and Politics": [ + "Census and government statistics", + "Election administration", + "Municipality Oversight ", + "Government information and archives", + "Government studies and investigations", + "Government trust funds", + "Lobbying and campaign finance", + "Political advertising", + "Public-private cooperation", + ], + "Healthcare": [ + "Alternative treatments", + "Telehealth", + "Veterinary Services and Pets", + "Dental care", + "Health care costs", + "Health insurance and coverage", + "Health facilities and institutions", + "Health information and medical records", + "Health technology, devices, supplies", + "Substance use disorder", + "Healthcare workforce", + "Medical research", + "Mental health", + "Prescription drugs", + "Sex and reproductive health", + ], + "Food, Drugs and Alcohol": [ + "Alcoholic beverages and licenses", + "Drug, alcohol, tobacco use", + "Food industry and services", + "Food supply, safety, and labeling", + "Nutrition and diet", + "Food service employment", + "Drug safety, medical device, and laboratory regulation", + ], + "Housing and Community Development": [ + "Community life and organization", + "Cooperative and condominium housing", + "Homelessness and emergency shelter", + "Housing discrimination", + "Housing finance and home ownership", + "Housing for the elderly and disabled", + "Housing industry and standards", + "Housing supply and affordability", + "Landlord and tenant", + "Low- and moderate-income housing", + "Residential rehabilitation and home repair", + ], + "Immigrants and Foreign Nationals": [ + "Immigrant health and welfare", + "Translation and language services", + "Refugees, asylum, displaced persons", + "Right to shelter", + ], + "Labor and Employment": [ + "Employee benefits", + "Employee pensions", + "Employee leave", + "Employee performance", + "Employment and training programs", + "Employment discrimination", + "Migrant, seasonal, agricultural labor", + "Self-employment", + "Temporary and part-time employment", + "Workers' compensation", + "Worker safety and health", + "Youth employment and child labor", + ], + "Law and Judiciary": [ + "Administrative remedies", + "Civil actions and liability", + "Civil disturbances", + "Evidence and witnesses", + "Judicial administration", + "Judicial review and appeals", + "Jurisdiction and venue", + "Legal fees and court costs", + "Property rights", + ], + "Public and Natural Resources": [ + "Forests, forestry, trees", + "Eminent domain", + "Marine and coastal resources, fisheries", + "Marine pollution", + "Monuments and memorials", + "Water resources", + "Wilderness", + ], + "Science, Technology, Communications": [ + "Advanced technology and technological innovations", + "Atmospheric science and weather", + "Computer security and identity theft", + "Computers and information technology", + "Genetics", + "Internet, web applications, social media", + "Photography and imaging", + "Telecommunication rates and fees", + "Telephone and wireless communication", + ], + "Social Services": [ + "Child care and development", + "Domestic violence and child abuse", + "Food assistance and relief", + "Home and outpatient care", + "Social work, volunteer service, charitable organizations", + "Unemployment", + "Urban and suburban affairs and development", + "Veterans' education, employment, rehabilitation", + "Veterans' loans, housing, homeless programs", + "Veterans' medical care", + ], + "Sports and Recreation": [ + "Art and culture", + "Hunting and fishing", + "Outdoor recreation", + "Public parks", + "Gambling and lottery", + "Professional sports, stadiums and arenas", + "Sports and recreation facilities", + ], + "Taxation": [ + "Capital gains tax", + "Corporate tax", + "Estate tax", + "Excise tax", + "Gift tax", + "Income tax", + "Payroll and employment tax", + "Property tax", + "Sales tax", + "Transfer and inheritance taxes", + "Tax-exempt organizations", + ], + "Transportation and Public Works": [ + "Aviation and airports", + "Highways and roads", + "Maritime affairs and fisheries", + "MBTA & Public Transportation", + "Public utilities and utility rates", + "Railroads", + "Water storage", + "Water use and supply", + ], +} + +new_tags = [ "Banking and financial institutions regulation", "Partnerships and Limited Liability Companies", "Non-Profit Law and Governance", @@ -496,9 +785,7 @@ "Corporation Law and Governance", "Marketing and advertising", "Retail and wholesale trades", - "Securities" - ], - "Crime and Law Enforcement": [ + "Securities", "Assault and harassment offenses", "Crimes against animals and natural resources", "Crimes against children", @@ -509,18 +796,14 @@ "Firearms and explosives", "Fraud offenses and financial crimes", "Correctional Facilities", - "Criminal Justice Reform" - ], - "Economics and Public Finance": [ + "Criminal Justice Reform", "Budget process", "Debt collection", "Financial literacy", "Financial services and investments", "Labor-management relations", "Public contracts and procurement", - "Pension and retirement benefits" - ], - "Education": [ + "Pension and retirement benefits", "Academic performance and assessments", "Adult education and literacy", "Educational facilities and institutions", @@ -531,24 +814,18 @@ "Student aid and college costs", "Teachers and educators", "Technology assessment", - "Vocational and technical education" - ], - "Emergency Management": [ + "Vocational and technical education", "Disaster relief and insurance", "Emergency communications systems", "Emergency medical services and trauma care", "Emergency planning and evacuation", - "Hazards and emergency operations" - ], - "Energy": [ + "Hazards and emergency operations", "Energy assistance", "Energy efficiency and conservation", "Energy prices", "Energy research", "Energy storage, supplies, demand", - "Renewable energy sources" - ], - "Environmental Protection": [ + "Renewable energy sources", "Air quality", "Environmental assessment, monitoring, research", "Environmental education", @@ -559,16 +836,12 @@ "Soil pollution", "Solid waste and recycling", "Water quality", - "Wetlands" - ], - "Families": [ + "Wetlands", "Adoption and foster care", "Family planning and birth control", "Family relationships and status", "Family services", - "Parenting" - ], - "Government Operations and Politics": [ + "Parenting", "Census and government statistics", "Election administration", "Municipality Oversight ", @@ -577,9 +850,7 @@ "Government trust funds", "Lobbying and campaign finance", "Political advertising", - "Public-private cooperation" - ], - "Healthcare": [ + "Public-private cooperation", "Alternative treatments", "Telehealth", "Veterinary Services and Pets", @@ -594,18 +865,14 @@ "Medical research", "Mental health", "Prescription drugs", - "Sex and reproductive health" - ], - "Food, Drugs and Alcohol": [ + "Sex and reproductive health", "Alcoholic beverages and licenses", "Drug, alcohol, tobacco use", "Food industry and services", "Food supply, safety, and labeling", "Nutrition and diet", "Food service employment", - "Drug safety, medical device, and laboratory regulation" - ], - "Housing and Community Development": [ + "Drug safety, medical device, and laboratory regulation", "Community life and organization", "Cooperative and condominium housing", "Homelessness and emergency shelter", @@ -616,15 +883,11 @@ "Housing supply and affordability", "Landlord and tenant", "Low- and moderate-income housing", - "Residential rehabilitation and home repair" - ], - "Immigrants and Foreign Nationals": [ + "Residential rehabilitation and home repair", "Immigrant health and welfare", "Translation and language services", "Refugees, asylum, displaced persons", - "Right to shelter" - ], - "Labor and Employment": [ + "Right to shelter", "Employee benefits", "Employee pensions", "Employee leave", @@ -636,9 +899,7 @@ "Temporary and part-time employment", "Workers' compensation", "Worker safety and health", - "Youth employment and child labor" - ], - "Law and Judiciary": [ + "Youth employment and child labor", "Administrative remedies", "Civil actions and liability", "Civil disturbances", @@ -647,18 +908,14 @@ "Judicial review and appeals", "Jurisdiction and venue", "Legal fees and court costs", - "Property rights" - ], - "Public and Natural Resources": [ + "Property rights", "Forests, forestry, trees", "Eminent domain", "Marine and coastal resources, fisheries", "Marine pollution", "Monuments and memorials", "Water resources", - "Wilderness" - ], - "Science, Technology, Communications": [ + "Wilderness", "Advanced technology and technological innovations", "Atmospheric science and weather", "Computer security and identity theft", @@ -667,9 +924,7 @@ "Internet, web applications, social media", "Photography and imaging", "Telecommunication rates and fees", - "Telephone and wireless communication" - ], - "Social Services": [ + "Telephone and wireless communication", "Child care and development", "Domestic violence and child abuse", "Food assistance and relief", @@ -679,18 +934,14 @@ "Urban and suburban affairs and development", "Veterans' education, employment, rehabilitation", "Veterans' loans, housing, homeless programs", - "Veterans' medical care" - ], - "Sports and Recreation": [ + "Veterans' medical care", "Art and culture", "Hunting and fishing", "Outdoor recreation", "Public parks", "Gambling and lottery", "Professional sports, stadiums and arenas", - "Sports and recreation facilities" - ], - "Taxation": [ + "Sports and recreation facilities", "Capital gains tax", "Corporate tax", "Estate tax", @@ -701,9 +952,7 @@ "Property tax", "Sales tax", "Transfer and inheritance taxes", - "Tax-exempt organizations" - ], - "Transportation and Public Works": [ + "Tax-exempt organizations", "Aviation and airports", "Highways and roads", "Maritime affairs and fisheries", @@ -711,191 +960,5 @@ "Public utilities and utility rates", "Railroads", "Water storage", - "Water use and supply" - ] -} - -new_tags = ['Banking and financial institutions regulation', - 'Partnerships and Limited Liability Companies', - 'Non-Profit Law and Governance', - 'Consumer Protection', - 'Corporation Law and Governance', - 'Marketing and advertising', - 'Retail and wholesale trades', - 'Securities', - 'Assault and harassment offenses', - 'Crimes against animals and natural resources', - 'Crimes against children', - 'Property Crimes', - 'Criminal investigation, prosecution, interrogation', - 'Criminal justice information and records', - 'Criminal Sentencing', - 'Firearms and explosives', - 'Fraud offenses and financial crimes', - 'Correctional Facilities', - 'Criminal Justice Reform', - 'Budget process', - 'Debt collection', - 'Financial literacy', - 'Financial services and investments', - 'Labor-management relations', - 'Public contracts and procurement', - 'Pension and retirement benefits', - 'Academic performance and assessments', - 'Adult education and literacy', - 'Educational facilities and institutions', - 'Elementary and secondary education', - 'Higher education', - 'Curriculum and standards', - 'Special education', - 'Student aid and college costs', - 'Teachers and educators', - 'Technology assessment', - 'Vocational and technical education', - 'Disaster relief and insurance', - 'Emergency communications systems', - 'Emergency medical services and trauma care', - 'Emergency planning and evacuation', - 'Hazards and emergency operations', - 'Energy assistance', - 'Energy efficiency and conservation', - 'Energy prices', - 'Energy research', - 'Energy storage, supplies, demand', - 'Renewable energy sources', - 'Air quality', - 'Environmental assessment, monitoring, research', - 'Environmental education', - 'Environmental health', - 'Environmental regulatory procedures', - 'Hazardous wastes and toxic substances', - 'Pollution control and abatement', - 'Soil pollution', - 'Solid waste and recycling', - 'Water quality', - 'Wetlands', - 'Adoption and foster care', - 'Family planning and birth control', - 'Family relationships and status', - 'Family services', - 'Parenting', - 'Census and government statistics', - 'Election administration', - 'Municipality Oversight ', - 'Government information and archives', - 'Government studies and investigations', - 'Government trust funds', - 'Lobbying and campaign finance', - 'Political advertising', - 'Public-private cooperation', - 'Alternative treatments', - 'Telehealth', - 'Veterinary Services and Pets', - 'Dental care', - 'Health care costs', - 'Health insurance and coverage', - 'Health facilities and institutions', - 'Health information and medical records', - 'Health technology, devices, supplies', - 'Substance use disorder', - 'Healthcare workforce', - 'Medical research', - 'Mental health', - 'Prescription drugs', - 'Sex and reproductive health', - 'Alcoholic beverages and licenses', - 'Drug, alcohol, tobacco use', - 'Food industry and services', - 'Food supply, safety, and labeling', - 'Nutrition and diet', - 'Food service employment', - 'Drug safety, medical device, and laboratory regulation', - 'Community life and organization', - 'Cooperative and condominium housing', - 'Homelessness and emergency shelter', - 'Housing discrimination', - 'Housing finance and home ownership', - 'Housing for the elderly and disabled', - 'Housing industry and standards', - 'Housing supply and affordability', - 'Landlord and tenant', - 'Low- and moderate-income housing', - 'Residential rehabilitation and home repair', - 'Immigrant health and welfare', - 'Translation and language services', - 'Refugees, asylum, displaced persons', - 'Right to shelter', - 'Employee benefits', - 'Employee pensions', - 'Employee leave', - 'Employee performance', - 'Employment and training programs', - 'Employment discrimination', - 'Migrant, seasonal, agricultural labor', - 'Self-employment', - 'Temporary and part-time employment', - "Workers' compensation", - 'Worker safety and health', - 'Youth employment and child labor', - 'Administrative remedies', - 'Civil actions and liability', - 'Civil disturbances', - 'Evidence and witnesses', - 'Judicial administration', - 'Judicial review and appeals', - 'Jurisdiction and venue', - 'Legal fees and court costs', - 'Property rights', - 'Forests, forestry, trees', - 'Eminent domain', - 'Marine and coastal resources, fisheries', - 'Marine pollution', - 'Monuments and memorials', - 'Water resources', - 'Wilderness', - 'Advanced technology and technological innovations', - 'Atmospheric science and weather', - 'Computer security and identity theft', - 'Computers and information technology', - 'Genetics', - 'Internet, web applications, social media', - 'Photography and imaging', - 'Telecommunication rates and fees', - 'Telephone and wireless communication', - 'Child care and development', - 'Domestic violence and child abuse', - 'Food assistance and relief', - 'Home and outpatient care', - 'Social work, volunteer service, charitable organizations', - 'Unemployment', - 'Urban and suburban affairs and development', - "Veterans' education, employment, rehabilitation", - "Veterans' loans, housing, homeless programs", - "Veterans' medical care", - 'Art and culture', - 'Hunting and fishing', - 'Outdoor recreation', - 'Public parks', - 'Gambling and lottery', - 'Professional sports, stadiums and arenas', - 'Sports and recreation facilities', - 'Capital gains tax', - 'Corporate tax', - 'Estate tax', - 'Excise tax', - 'Gift tax', - 'Income tax', - 'Payroll and employment tax', - 'Property tax', - 'Sales tax', - 'Transfer and inheritance taxes', - 'Tax-exempt organizations', - 'Aviation and airports', - 'Highways and roads', - 'Maritime affairs and fisheries', - 'MBTA & Public Transportation', - 'Public utilities and utility rates', - 'Railroads', - 'Water storage', - 'Water use and supply' - ] \ No newline at end of file + "Water use and supply", +] diff --git a/llm/test_bill_on_document_created.py b/llm/test_bill_on_document_created.py index 7414614ef..49d826de7 100644 --- a/llm/test_bill_on_document_created.py +++ b/llm/test_bill_on_document_created.py @@ -3,7 +3,6 @@ get_categories_from_topics, TopicAndCategory, ) -from collections import deque def test_get_categories_from_topics_empty():