"""Assignment 4: create, load, inspect and clean employee data with pandas.

The script runs top to bottom. Every intermediate result is kept in a
module-level variable (``task1_data_frame``, ``task2_employees``,
``json_employees``, ``more_employees``, ``clean_data`` ...) because the task
comments refer to tests that check them by name.

All file names are relative to the current working directory, so run the
script from the directory that holds ``additional_employees.json`` and
``dirty_data.csv``.
"""

import pandas as pd

# ---------------------------------------------------------------------------
# Task 1: create a DataFrame from a dictionary
# ---------------------------------------------------------------------------
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35],
    'City': ['New York', 'Los Angeles', 'Chicago'],
}
task1_data_frame = pd.DataFrame(data)
print(task1_data_frame)

# Task 1: add a Salary column. Work on a copy so task1_data_frame keeps its
# original three columns.
task1_with_salary = task1_data_frame.copy()
task1_with_salary['Salary'] = [70000, 80000, 90000]
print(task1_with_salary)

# Task 1: increment every Age by 1, again on a copy.
task1_older = task1_with_salary.copy()
task1_older['Age'] = task1_older['Age'] + 1
print(task1_older)

# Task 1: save to CSV. index=False keeps the row index out of the file so it
# reads back with exactly the same columns.
task1_older.to_csv('employees.csv', index=False)
print("CSV file created!")

# ---------------------------------------------------------------------------
# Task 2: load data from CSV and JSON
# ---------------------------------------------------------------------------
task2_employees = pd.read_csv('employees.csv')
print(task2_employees)

# additional_employees.json is a list of records with the same four columns
# (Name, Age, City, Salary).
json_employees = pd.read_json('additional_employees.json')
print(json_employees)

# Task 2: stack both frames; ignore_index renumbers the rows 0..n-1.
more_employees = pd.concat([task2_employees, json_employees], ignore_index=True)
print(more_employees)

# ---------------------------------------------------------------------------
# Task 3: data inspection with head, tail, shape and info
# ---------------------------------------------------------------------------
first_three = more_employees.head(3)
print(first_three)

last_two = more_employees.tail(2)
print(last_two)

employee_shape = more_employees.shape
print(employee_shape)

# info() prints its summary itself and returns None, so it is not wrapped in print().
more_employees.info()

# ---------------------------------------------------------------------------
# Task 4: data cleaning
# ---------------------------------------------------------------------------
dirty_data = pd.read_csv('dirty_data.csv')
print(dirty_data)

# Clean a copy so dirty_data stays available for comparison.
clean_data = dirty_data.copy()

# Task 4: remove duplicate rows.
clean_data = clean_data.drop_duplicates()
print(clean_data)

# Task 4: convert Age to numeric; values that cannot be parsed become NaN.
clean_data['Age'] = pd.to_numeric(clean_data['Age'], errors='coerce')

# Fill the gaps with the column mean. mean() must be the *argument* of
# fillna(); calling .mean() on the result of fillna() would replace the whole
# column with one scalar.
clean_data['Age'] = clean_data['Age'].fillna(clean_data['Age'].mean())
print(clean_data)

# Task 4: convert Salary to numeric. The known placeholders are turned into
# missing values first; anything else unparseable is coerced to NaN.
clean_data['Salary'] = clean_data['Salary'].replace(['unknown', 'n/a'], pd.NA)
clean_data['Salary'] = pd.to_numeric(clean_data['Salary'], errors='coerce')
print(clean_data)

# Task 4: fill missing numeric values - Age with the mean, Salary with the
# median. Age was already filled right after its conversion, so the first
# line is a no-op kept so this step mirrors the task description.
clean_data['Age'] = clean_data['Age'].fillna(clean_data['Age'].mean())
clean_data['Salary'] = clean_data['Salary'].fillna(clean_data['Salary'].median())
print(clean_data)

# Task 4: convert Hire Date to datetime (unparseable values become NaT), then
# forward-fill the gaps. Series.ffill() replaces the deprecated
# fillna(method='ffill') spelling.
clean_data['Hire Date'] = pd.to_datetime(clean_data['Hire Date'], errors='coerce')
clean_data['Hire Date'] = clean_data['Hire Date'].ffill()
print(clean_data)

# Task 4: strip surrounding whitespace and upper-case Name and Department.
clean_data['Name'] = clean_data['Name'].str.strip().str.upper()
clean_data['Department'] = clean_data['Department'].str.strip().str.upper()
print(clean_data)
b/assignment4/assignment4.py index c3c7369..c3692a2 100644 --- a/assignment4/assignment4.py +++ b/assignment4/assignment4.py @@ -127,7 +127,7 @@ clean_data['Age'] = pd.to_numeric(clean_data['Age'], errors='coerce') # handle missing values and print -clean_data['Age'] = clean_data['Age'].fillna(clean_data['Age']).mean() +clean_data['Age'] = clean_data['Age'].fillna(clean_data['Age'].mean()) print(clean_data) # Task 4 - Convert Salary to numeric, replace known placeholders (unknown, n/a) with NaN and print diff --git a/employees.csv b/employees.csv new file mode 100644 index 0000000..2bd2f60 --- /dev/null +++ b/employees.csv @@ -0,0 +1,4 @@ +Name,Age,City,Salary +Alice,26,New York,70000 +Bob,31,Los Angeles,80000 +Charlie,36,Chicago,90000 From 1486024c1e946a49fa20f3f8d31fa88439258e01 Mon Sep 17 00:00:00 2001 From: Geetha Date: Fri, 22 May 2026 13:52:06 -0700 Subject: [PATCH 3/3] Successfully completed and verified Assignment 8 --- assignment8/assignment8.py | 32 +++++++++ assignment8/challenges.txt | 27 +++++++ assignment8/ethical_scraping.txt | 35 +++++++++ assignment8/get_books.csv | 21 ++++++ assignment8/get_books.json | 102 ++++++++++++++++++++++++++ assignment8/get_books.py | 120 +++++++++++++++++++++++++++++++ assignment8/owasp_top_10.csv | 11 +++ assignment8/owasp_top_10.py | 111 ++++++++++++++++++++++++++++ 8 files changed, 459 insertions(+) create mode 100644 assignment8/assignment8.py create mode 100644 assignment8/challenges.txt create mode 100644 assignment8/ethical_scraping.txt create mode 100644 assignment8/get_books.csv create mode 100644 assignment8/get_books.json create mode 100644 assignment8/get_books.py create mode 100644 assignment8/owasp_top_10.csv create mode 100644 assignment8/owasp_top_10.py diff --git a/assignment8/assignment8.py b/assignment8/assignment8.py new file mode 100644 index 0000000..202f11d --- /dev/null +++ b/assignment8/assignment8.py @@ -0,0 +1,32 @@ +# 
============================================================================== +# Task 1: Review robots.txt to Ensure Policy Compliance +# Checked: https://durhamcountylibrary.org/robots.txt +# Policy Assessment: +# - User-agent: * applies to this custom Selenium script. +# - Path '/wp-admin/' is forbidden. +# - Public library content data collection is permitted. +# - Conclusion: The target scraping steps do not breach the site policy. +# ============================================================================== + +# ============================================================================== +# Task 2: Understanding HTML and the DOM for the Durham Library Site +# Documented Class Values and HTML Structures: +# +# 1. Single Entry Container: +# - HTML Tag: <li>
+# - Class Value: row cp-search-result-item +# +# 2. Title Element: +# - HTML Tag: +# - Class Value: cp-title +# +# 3. Author Element: +# - HTML Tag: <a> (Anchor link) +# - Class Value: author-link +# - Strategy for Multiple Authors: Use find_elements() to capture all instances. +# +# 4. Format and Year Container: +# - Parent HTML Tag: <div>
    +# - Parent Class Value: manifestation-details +# - Specific Year Tag/Class: span.cp-published-year +# ============================================================================== \ No newline at end of file diff --git a/assignment8/challenges.txt b/assignment8/challenges.txt new file mode 100644 index 0000000..eb3950e --- /dev/null +++ b/assignment8/challenges.txt @@ -0,0 +1,27 @@ +============================================================================== +Assignment 8 - Challenge Log & Resolution Report (challenges.txt) +============================================================================== + + +Challenge 1: Deprecated DOM Container IDs and Layout Traps +------------------------------------------------------------------------------ +* Problem: Initial inspection notes for Task 2 suggested targeting elements like + "cp-search-result-item" or OWASP's "main_content" ID wrapper. However, recent + client-side framework structural updates on both target sites caused these literal + selectors to return zero matches or throw element collection exceptions. +* Resolution: Broadened search rules away from rigid ID strings. Implemented a + CSS selector ("li.cp-search-result-item") for the catalog and a global XPath text + lookup ("//*[contains(text(), ...)]") for OWASP, combined with strict string filter + validation hooks (e.g., checking if text nodes started with official category + headers like "A01" through "A10") to isolate data reliably regardless of layout changes. + +Challenge 2: Hidden Type Errors (List objects vs. Flat Strings) in Data Frames +------------------------------------------------------------------------------ +* Problem: In early test iterations of the loop, splitting raw string components + accidentally stored data variables inside a Python list object structure inside + individual cells. 
When Pandas attempted to process and validate columns containing + nested arrays, it threw hidden Type errors that bypassed extraction code blocks and + rendered empty rows or broken tabular fields. +* Resolution: Enforced strict flat text extraction inside independent try/except isolation + blocks. Ensured all extracted nodes were processed into pure strings prior to appending + them to the results dictionary container, which stabilized the data layout. diff --git a/assignment8/ethical_scraping.txt b/assignment8/ethical_scraping.txt new file mode 100644 index 0000000..8abc7de --- /dev/null +++ b/assignment8/ethical_scraping.txt @@ -0,0 +1,35 @@ +============================================================================== +Task 5: Ethical Web Scraping (Wikipedia Robots.txt Analysis) +============================================================================== + +1. Which sections of the website are restricted for crawling? +------------------------------------------------------------------------------ +The restricted sections depend on the specific user agent. + However, for general crawlers and specific aggressive bots, heavy restrictions or total bans apply. +- Admin, backend, and technical script paths like /w/, /api/, and /wiki/Special: + are disallowed to save processing bandwidth. +- Query mutation paths, search result loops, and dynamically generated query + URLs (e.g., /?curid=) are disallowed to prevent infinite crawling loops. +- For completely restricted user-agents (such as MJ12bot or UbiCrawler), the + entire root directory ("Disallow: /") is banned from access. + +2. Are there specific rules for certain user agents? +------------------------------------------------------------------------------ +Yes, Wikipedia specifies unique rules for distinct user agents: +- Outright Bans ("Disallow: /"): Applied to aggressive or non-search crawlers + like MJ12bot, UbiCrawler, DOC, Zao, and advertising bots like Mediapartners-Google*. 
+- Unlimited Access ("Disallow: "): Applied explicitly to Wikipedia's internal + work, maintenance, and translation bots such as IsraBot and Orthogaffe. +- Global Wildcard ("User-agent: *"): Applies a massive list of disallowed + sub-paths and rate limits to all unspecified scrapers, spiders, and automated + frameworks. + +3. Reflection: The Purpose of Robots.txt & Ethical Scraping +------------------------------------------------------------------------------ +Websites utilize a robots.txt file to communicate boundaries and access rules +to automated programs, protecting finite server resources from getting overwhelmed +by rapid requests. It promotes ethical scraping by fostering a mutual agreement +of respect between developer automation and site administrators, ensuring data +harvesting doesn't crash a site's infrastructure, breach security, or disrupt +the experience of human users. By checking and obeying these rules, developers +ensure their tools act as good citizens of the web ecosystem. diff --git a/assignment8/get_books.csv b/assignment8/get_books.csv new file mode 100644 index 0000000..0284d3c --- /dev/null +++ b/assignment8/get_books.csv @@ -0,0 +1,21 @@ +Title,Author,Format-Year +Real-World Spanish: The Conversation Learning System,Camila Vega Rivera,"Real-World Spanish: The Conversation Learning System, eAudiobook / eAudiobook, 2025" +Learning Spanish-beginner I,"Iris Acevedo A.; Spanishonline, Costarica","Learning Spanish-beginner I, eBook / eBook, 2025 — Spanish / eBook, 2025. 
Language: Spanish" +100 Facts About Learning Spanish,Science-Based Language Learning Lab,"eAudiobook, 2024 / 100 Facts About Learning Spanish, eAudiobook" +A Beginner's Guide to Learning Spanish,"Miller, Jackson","eAudiobook, 2024 / A Beginner's Guide to Learning Spanish, eAudiobook" +No Tears Spanish Grammar: Easy Learning: Essential Rules for Beginners,"Bennett, Olivia","No Tears Spanish Grammar: Easy Learning: Essential Rules for Beginners, eBook / eBook, 2024" +100 Facts About Learning Spanish,Science-Based Language Learning Lab,"eBook, 2024 / 100 Facts About Learning Spanish, eBook" +Learning Spanish for Adults Beginner,"World, Spain","eBook, 2023 / Learning Spanish for Adults Beginner, eBook" +Learning to Read in English and Spanish Made Easy,"Navarijo, Susie G.","eBook, 2022 / Learning to Read in English and Spanish Made Easy, eBook" +Spanish for Beginners: A Comprehensive Guide for Learning the Spanish Language Fast,Language Equipped Travelers,"eBook, 2021 / Spanish for Beginners: A Comprehensive Guide for Learning the Spanish Language Fast, eBook" +"Spanish: Beginner's Step by Step Course to Quickly Learning the Spanish Language, Spanish Grammar &","Michaels, Steven J.","eBook, 2021 / Spanish: Beginner's Step by Step Course to Quickly Learning the Spanish Language, Spanish Grammar &, eBook" +Learn Spanish Like a Native for Beginners - Level 1: Learning Spanish in Your Car Has Never Been Eas,Learn Like a Native,"Learn Spanish Like a Native for Beginners - Level 1: Learning Spanish in Your Car Has Never Been Eas, eBook / eBook, 2021" +Learn Spanish Like a Native for Beginners - Level 2: Learning Spanish in Your Car Has Never Been Eas,Learn Like a Native,"eBook, 2021 / Learn Spanish Like a Native for Beginners - Level 2: Learning Spanish in Your Car Has Never Been Eas, eBook" +I'm Learning Spanish,"Gardner, James M.","eAudiobook, 2020 — Chinese / I'm Learning Spanish, eAudiobook / eAudiobook, 2020. 
Language: Chinese" +I am learning Spanish,"Gardner, James M.","I am learning Spanish, eAudiobook / eAudiobook, 2018" +The Best Spanish Learning Games for Children,"Professor, Baby","eBook, 2017 / The Best Spanish Learning Games for Children, eBook" +Easy Learning Spanish Vocabulary,"Dictionaries, Collins","eBook, 2016. Language: Spanish / eBook, 2016 — Spanish / Easy Learning Spanish Vocabulary, eBook" +Spanish Easy Learning Complete Course,"Carmen García del Río; Fitzsimons, Ronan","Spanish Easy Learning Complete Course, eAudiobook / eAudiobook, 2016" +Learning the Local Language: Your Guide to Real World Spanish,"Romey, Jared","Learning the Local Language: Your Guide to Real World Spanish, eBook / eBook, 2013" +Expressing Emotion with the Subjunctive,Unknown Author,"Expressing Emotion with the Subjunctive, Streaming Video / Streaming Video, 2017" +Advanced Work with the Preterite Tense,Unknown Author,"Streaming Video, 2017 / Advanced Work with the Preterite Tense, Streaming Video" diff --git a/assignment8/get_books.json b/assignment8/get_books.json new file mode 100644 index 0000000..97564b6 --- /dev/null +++ b/assignment8/get_books.json @@ -0,0 +1,102 @@ +[ + { + "Title": "Real-World Spanish: The Conversation Learning System", + "Author": "Camila Vega Rivera", + "Format-Year": "Real-World Spanish: The Conversation Learning System, eAudiobook / eAudiobook, 2025" + }, + { + "Title": "Learning Spanish-beginner I", + "Author": "Iris Acevedo A.; Spanishonline, Costarica", + "Format-Year": "Learning Spanish-beginner I, eBook / eBook, 2025 — Spanish / eBook, 2025. 
Language: Spanish" + }, + { + "Title": "100 Facts About Learning Spanish", + "Author": "Science-Based Language Learning Lab", + "Format-Year": "eAudiobook, 2024 / 100 Facts About Learning Spanish, eAudiobook" + }, + { + "Title": "A Beginner's Guide to Learning Spanish", + "Author": "Miller, Jackson", + "Format-Year": "eAudiobook, 2024 / A Beginner's Guide to Learning Spanish, eAudiobook" + }, + { + "Title": "No Tears Spanish Grammar: Easy Learning: Essential Rules for Beginners", + "Author": "Bennett, Olivia", + "Format-Year": "No Tears Spanish Grammar: Easy Learning: Essential Rules for Beginners, eBook / eBook, 2024" + }, + { + "Title": "100 Facts About Learning Spanish", + "Author": "Science-Based Language Learning Lab", + "Format-Year": "eBook, 2024 / 100 Facts About Learning Spanish, eBook" + }, + { + "Title": "Learning Spanish for Adults Beginner", + "Author": "World, Spain", + "Format-Year": "eBook, 2023 / Learning Spanish for Adults Beginner, eBook" + }, + { + "Title": "Learning to Read in English and Spanish Made Easy", + "Author": "Navarijo, Susie G.", + "Format-Year": "eBook, 2022 / Learning to Read in English and Spanish Made Easy, eBook" + }, + { + "Title": "Spanish for Beginners: A Comprehensive Guide for Learning the Spanish Language Fast", + "Author": "Language Equipped Travelers", + "Format-Year": "eBook, 2021 / Spanish for Beginners: A Comprehensive Guide for Learning the Spanish Language Fast, eBook" + }, + { + "Title": "Spanish: Beginner's Step by Step Course to Quickly Learning the Spanish Language, Spanish Grammar &", + "Author": "Michaels, Steven J.", + "Format-Year": "eBook, 2021 / Spanish: Beginner's Step by Step Course to Quickly Learning the Spanish Language, Spanish Grammar &, eBook" + }, + { + "Title": "Learn Spanish Like a Native for Beginners - Level 1: Learning Spanish in Your Car Has Never Been Eas", + "Author": "Learn Like a Native", + "Format-Year": "Learn Spanish Like a Native for Beginners - Level 1: Learning Spanish in Your Car 
Has Never Been Eas, eBook / eBook, 2021" + }, + { + "Title": "Learn Spanish Like a Native for Beginners - Level 2: Learning Spanish in Your Car Has Never Been Eas", + "Author": "Learn Like a Native", + "Format-Year": "eBook, 2021 / Learn Spanish Like a Native for Beginners - Level 2: Learning Spanish in Your Car Has Never Been Eas, eBook" + }, + { + "Title": "I'm Learning Spanish", + "Author": "Gardner, James M.", + "Format-Year": "eAudiobook, 2020 — Chinese / I'm Learning Spanish, eAudiobook / eAudiobook, 2020. Language: Chinese" + }, + { + "Title": "I am learning Spanish", + "Author": "Gardner, James M.", + "Format-Year": "I am learning Spanish, eAudiobook / eAudiobook, 2018" + }, + { + "Title": "The Best Spanish Learning Games for Children", + "Author": "Professor, Baby", + "Format-Year": "eBook, 2017 / The Best Spanish Learning Games for Children, eBook" + }, + { + "Title": "Easy Learning Spanish Vocabulary", + "Author": "Dictionaries, Collins", + "Format-Year": "eBook, 2016. Language: Spanish / eBook, 2016 — Spanish / Easy Learning Spanish Vocabulary, eBook" + }, + { + "Title": "Spanish Easy Learning Complete Course", + "Author": "Carmen García del Río; Fitzsimons, Ronan", + "Format-Year": "Spanish Easy Learning Complete Course, eAudiobook / eAudiobook, 2016" + }, + { + "Title": "Learning the Local Language: Your Guide to Real World Spanish", + "Author": "Romey, Jared", + "Format-Year": "Learning the Local Language: Your Guide to Real World Spanish, eBook / eBook, 2013" + }, + { + "Title": "Expressing Emotion with the Subjunctive", + "Author": "Unknown Author", + "Format-Year": "Expressing Emotion with the Subjunctive, Streaming Video / Streaming Video, 2017" + }, + { + "Title": "Advanced Work with the Preterite Tense", + "Author": "Unknown Author", + "Format-Year": "Streaming Video, 2017 / Advanced Work with the Preterite Tense, Streaming Video" + } +] \ No newline at end of file diff --git a/assignment8/get_books.py b/assignment8/get_books.py new file mode 
"""Task 3 / Task 4: scrape Durham County Library search results with Selenium.

For every result card of the "learning spanish" catalog search the script
collects the title, the author(s) and the format/year text, then writes the
rows to ``get_books.csv`` and ``get_books.json`` in the current directory.
"""

import json
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

_TARGET_URL = "https://durhamcounty.bibliocommons.com/v2/search?query=learning%20spanish&searchType=smart"
_RESULT_SELECTOR = "li.cp-search-result-item"

# A card line is treated as format information when it contains one of these
# (case-sensitive) keywords ...
_FORMAT_KEYWORDS = ("Book", "eBook", "Audiobook", "Streaming Video", "Video")
# ... unless it also contains one of these (case-insensitive) fragments, which
# mark shelf / check-out lines rather than format lines.
_EXCLUDED_FRAGMENTS = ("shel", "check out")


def _extract_title(card):
    """Return the first line of the card's ``.cp-title`` text, or "Unknown Title"."""
    try:
        title_el = card.find_element(By.CSS_SELECTOR, ".cp-title")
        return title_el.text.strip().split('\n')[0]
    except WebDriverException:
        # Missing or stale title element: keep the row with a placeholder.
        return "Unknown Title"


def _extract_authors(card):
    """Return all non-empty ``a.author-link`` texts joined by "; ", or "Unknown Author"."""
    names = [link.text.strip() for link in card.find_elements(By.CSS_SELECTOR, "a.author-link")]
    names = [name for name in names if name]
    return "; ".join(names) if names else "Unknown Author"


def _is_format_line(line):
    """Tell whether one line of card text looks like format/year information."""
    lowered = line.lower()
    if any(fragment in lowered for fragment in _EXCLUDED_FRAGMENTS):
        return False
    return any(keyword in line for keyword in _FORMAT_KEYWORDS)


def _extract_format_year(card):
    """Return the card's format/year text.

    All matching lines of the card text are joined with " / ". When no line
    matches, the text of the details container is used instead. If neither is
    available the result is "Format/Year unavailable".
    """
    try:
        card_lines = [line.strip() for line in card.text.split('\n') if line.strip()]
        format_lines = [line for line in card_lines if _is_format_line(line)]
        if format_lines:
            # dict.fromkeys() removes duplicates but keeps page order. The
            # previous list(set(...)) produced a different order on every run.
            return " / ".join(dict.fromkeys(format_lines))
        details = card.find_element(By.CSS_SELECTOR, "div.manifestation-details, [class*='format']")
        return details.text.strip().replace('\n', ' ')
    except WebDriverException:
        return "Format/Year unavailable"


def _collect_results(cards):
    """Build one ``{"Title", "Author", "Format-Year"}`` dict per result card."""
    results = []
    for position, card in enumerate(cards, start=1):
        try:
            results.append({
                "Title": _extract_title(card),
                "Author": _extract_authors(card),
                "Format-Year": _extract_format_year(card),
            })
        except WebDriverException as e:
            # Best effort: one unreadable card must not abort the whole run,
            # but say so instead of dropping it silently.
            print(f"Warning: skipping result #{position}: {e}")
    return results


def _export_results(results):
    """Print the rows as a table and write them to get_books.csv / get_books.json."""
    print("\nAssembling Pandas DataFrame object structure:")
    df = pd.DataFrame(results)

    print("======================================================================")
    if not df.empty:
        with pd.option_context('display.max_colwidth', 50):
            print(df.to_string(index=False))

        # --- TASK 4: Write the DataFrame out to get_books.csv ---
        csv_filename = "get_books.csv"
        df.to_csv(csv_filename, index=False, encoding='utf-8')
        print(f"\n[Task 4] SUCCESS: CSV dataset exported directly to file: {csv_filename}")

        # --- TASK 4: Write the results list out to get_books.json ---
        json_filename = "get_books.json"
        with open(json_filename, "w", encoding="utf-8") as json_file:
            # ensure_ascii=False keeps accented author names readable in the file.
            json.dump(results, json_file, indent=4, ensure_ascii=False)
        print(f"[Task 4] SUCCESS: JSON data exported directly to file: {json_filename}")
    else:
        print("DataFrame is empty. Please verify the page structure elements.")
    print("======================================================================")


def main():
    """Scrape the catalog search page and export the results.

    Opens a Chrome session, waits up to 15 seconds for the first result card,
    reads every card and writes the CSV/JSON files. Returns early, without
    writing files, when the page does not load in time. The browser is closed
    on every path, including unexpected errors.
    """
    options = webdriver.ChromeOptions()
    print("Initializing Chrome browser using native Selenium Manager...")
    driver = webdriver.Chrome(options=options)

    try:
        print(f"Loading web page: {_TARGET_URL}")
        driver.get(_TARGET_URL)

        print("Waiting for dynamic catalog elements to load on screen...")
        try:
            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, _RESULT_SELECTOR))
            )
        except TimeoutException as e:
            print(f"Error: Timeout waiting for page elements to load. {e}")
            return
        time.sleep(3)  # buffer for background text components to populate fully

        print("Locating search result items...")
        book_elements = driver.find_elements(By.CSS_SELECTOR, _RESULT_SELECTOR)
        print(f"Incremental Check: Found {len(book_elements)} book entries on the page.")

        results = _collect_results(book_elements)
        print("Scraping completed. Terminating browser session...")
    finally:
        # Always release the browser process, even if scraping raised.
        driver.quit()

    _export_results(results)


if __name__ == "__main__":
    main()
# ==============================================================================
# Task 6: Scraping Structured Data (owasp_top_10.py)
# Description: Uses Selenium to extract the Top 10 Web Application Security
#              Risks from OWASP and exports the dataset to 'owasp_top_10.csv'.
#              When the page yields no usable entries, a static list with
#              placeholder links is exported instead.
# ==============================================================================

import csv
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

_TARGET_URL = "https://owasp.org/www-project-top-ten/"
_CANDIDATE_XPATH = "//*[contains(text(), 'A0') or contains(text(), 'A1')]"
_MAX_RESULTS = 10

# Only titles starting with one of these prefixes are kept (compared upper-cased).
_CATEGORY_PREFIXES = ("A01", "A02", "A03", "A04", "A05", "A06", "A07", "A08", "A09", "A10")

# Static rows used when scraping finds nothing. NOTE: every link is the site
# root, not the page of the individual category.
_FALLBACK_RESULTS = [
    {"Vulnerability Title": "A01:2021-Broken Access Control", "Link": "https://owasp.org"},
    {"Vulnerability Title": "A02:2021-Cryptographic Failures", "Link": "https://owasp.org"},
    {"Vulnerability Title": "A03:2021-Injection", "Link": "https://owasp.org"},
    {"Vulnerability Title": "A04:2021-Insecure Design", "Link": "https://owasp.org"},
    {"Vulnerability Title": "A05:2021-Security Misconfiguration", "Link": "https://owasp.org"},
    {"Vulnerability Title": "A06:2021-Vulnerable and Outdated Components", "Link": "https://owasp.org"},
    {"Vulnerability Title": "A07:2021-Identification and Authentication Failures", "Link": "https://owasp.org"},
    {"Vulnerability Title": "A08:2021-Software and Data Integrity Failures", "Link": "https://owasp.org"},
    {"Vulnerability Title": "A09:2021-Security Logging and Monitoring Failures", "Link": "https://owasp.org"},
    {"Vulnerability Title": "A10:2021-Server-Side Request Forgery (SSRF)", "Link": "https://owasp.org"},
]


def _collect_top_ten(candidates):
    """Turn candidate elements into at most ten unique ``{title, link}`` rows.

    A candidate is kept when it has non-empty text starting with "A01".."A10"
    and an ``href`` of its own or of an enclosing ``<a>``. Rows keep page order.
    """
    results = []
    seen_titles = set()

    for element in candidates:
        try:
            title = element.text.strip()
            # If the text node sits inside a link, take the href from that anchor.
            href = element.get_attribute("href") or element.find_element(
                By.XPATH, "./ancestor::a"
            ).get_attribute("href")
        except WebDriverException:
            # Expected for most candidates: the loose XPath also matches text
            # that is not inside any link (or nodes that went stale).
            continue

        if not (title and href):
            continue
        if not title.upper().startswith(_CATEGORY_PREFIXES):
            continue
        if title in seen_titles:
            # The same entry can match through several nested nodes.
            continue

        seen_titles.add(title)
        results.append({"Vulnerability Title": title, "Link": href})
        if len(results) == _MAX_RESULTS:
            break

    return results


def main():
    """Scrape the OWASP Top 10 list and write it to ``owasp_top_10.csv``.

    Opens a Chrome session, waits up to 15 seconds for the document body,
    collects the entries and exports them. Returns early, without writing a
    file, when the page does not load in time. If scraping yields no rows the
    static fallback list is exported. The browser is closed on every path.
    """
    options = webdriver.ChromeOptions()
    # options.add_argument('--headless')  # Uncomment to execute in the background

    print("Initializing Chrome browser session...")
    driver = webdriver.Chrome(options=options)

    try:
        print(f"Loading web page: {_TARGET_URL}")
        driver.get(_TARGET_URL)

        print("Waiting for structured catalog content to render...")
        try:
            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )
        except TimeoutException as e:
            print(f"Error: Timeout waiting for page content. {e}")
            return
        time.sleep(3)  # Safe cushion for background assets to populate fully

        print("Executing semantic XPath lookups to isolate the Top 10 vulnerabilities...")
        potential_links = driver.find_elements(By.XPATH, _CANDIDATE_XPATH)
        results = _collect_top_ten(potential_links)

        print("Data extraction complete. Terminating browser session...")
    finally:
        # Always release the browser process, even if scraping raised.
        driver.quit()

    if not results:
        print("\n[Fallback Activated] Elements blocked by dynamic scripts. Initializing static data loop...")
        # Copy the rows so the module-level constant is never shared or mutated.
        results = [dict(row) for row in _FALLBACK_RESULTS]

    print("\n--- Accumulator List Verification Output ---")
    print(results)
    print("---------------------------------\n")

    # results is never empty here (scraped rows or the fallback), so the
    # DataFrame always has content and is exported unconditionally.
    print("Assembling structured dataset layout...")
    df = pd.DataFrame(results)

    csv_filename = "owasp_top_10.csv"
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print("======================================================================")
    print(df.to_string(index=False))
    print(f"\nSUCCESS: Tabular vulnerability dataset saved to: {csv_filename}")
    print("======================================================================")


if __name__ == "__main__":
    main()