"""Assignment 4: build, combine, inspect, and clean employee data with pandas.

The script runs top to bottom and leaves its intermediate DataFrames in
module-level names so they can be inspected after import.
"""
import pandas as pd

# ---------------------------------------------------------------------------
# Task 1: create a DataFrame, derive two variants, round-trip through CSV
# ---------------------------------------------------------------------------
data = {
    "Name": ['Alice', 'Bob', 'Charlie'],
    "Age": [25, 30, 35],
    "City": ['New York', 'Los Angeles', 'Chicago']
}
task1_data_frame = pd.DataFrame(data)
print(task1_data_frame)

# assign() returns a new frame, so task1_data_frame itself is left untouched.
task1_with_salary = task1_data_frame.assign(Salary=[70000, 80000, 90000])
print(task1_with_salary)

# Everyone is one year older; the "Age" column keeps its original position.
task1_older = task1_with_salary.assign(Age=task1_with_salary["Age"] + 1)
print(task1_older)

# Write without the index column, then read the file back.
task1_older.to_csv("employees.csv", index=False)
df = pd.read_csv("employees.csv")
print(df)

# ---------------------------------------------------------------------------
# Task 2: load the CSV and the JSON file, then stack them into one frame
# ---------------------------------------------------------------------------
task2_employees = pd.read_csv("employees.csv")
print(task2_employees)

json_employees = pd.read_json("additional_employees.json")
print(json_employees)

# ignore_index=True renumbers the combined rows 0..n-1.
more_employees = pd.concat(
    [task2_employees, json_employees],
    ignore_index=True,
)
print(more_employees)

# ---------------------------------------------------------------------------
# Inspect the combined frame: first rows, last rows, shape, and summary
# ---------------------------------------------------------------------------
first_three = more_employees.head(3)
print(first_three)

last_two = more_employees.tail(2)
print(last_two)

employee_shape = more_employees.shape
print(employee_shape)

more_employees.info()

# ---------------------------------------------------------------------------
# Task 4: clean dirty_data.csv
# ---------------------------------------------------------------------------
dirty_data = pd.read_csv("dirty_data.csv")

# Work on a copy so dirty_data stays as loaded; drop exact duplicate rows.
clean_data = dirty_data.copy().drop_duplicates()

# Non-numeric entries become NaN (errors="coerce") ...
clean_data["Age"] = pd.to_numeric(clean_data["Age"], errors="coerce")
clean_data["Salary"] = pd.to_numeric(clean_data["Salary"], errors="coerce")

# ... and the gaps are filled with the column mean (Age) / median (Salary).
age_mean = clean_data["Age"].mean()
salary_median = clean_data["Salary"].median()
clean_data["Age"] = clean_data["Age"].fillna(age_mean)
clean_data["Salary"] = clean_data["Salary"].fillna(salary_median)

# Unparseable hire dates become NaT and are replaced by a fixed placeholder.
parsed_hire_dates = pd.to_datetime(clean_data["Hire Date"], errors="coerce")
clean_data["Hire Date"] = parsed_hire_dates.fillna(pd.Timestamp("2000-01-01"))
print(clean_data)

# Normalise the text columns: trim surrounding whitespace and upper-case.
for text_column in ("Name", "Department"):
    clean_data[text_column] = clean_data[text_column].str.strip().str.upper()

print(clean_data)
+ +Allowed paths: +- Search results pages are not restricted + +Compliance confirmation: +- Our scraping targets the public search results page only +- We are not accessing any disallowed paths +- We are not scraping at a fast rate that would abuse the server \ No newline at end of file diff --git a/assignment8/ethical_scraping.txt b/assignment8/ethical_scraping.txt new file mode 100644 index 0000000..3b446c2 --- /dev/null +++ b/assignment8/ethical_scraping.txt @@ -0,0 +1,44 @@ +Question 1: Which sections of the website are restricted for crawling? + +The following URL paths are disallowed in Wikipedia's robots.txt: + +Disallow: /wiki/MediaWiki:Spam-blacklist +Disallow: /wiki/MediaWiki_talk:Spam-blacklist +Disallow: /wiki/Wikipedia:WikiProject_Spam +Disallow: /wiki/Wikipedia_talk:WikiProject_Spam +Disallow: /wiki/Wikipedia:Articles_for_deletion +Disallow: /wiki/Wikipedia:Article_Incubator +Disallow: /wiki/Wikipedia_talk:Article_Incubator +Disallow: /wiki/Category:Noindexed_pages +Disallow: /wiki/Module:Sandbox +Disallow: /wiki/Template:TemplateStyles_sandbox +Disallow: /wiki/Wikipedia:Administrator_recall +Disallow: /wiki/Wikipedia_talk:Administrator_recall +Disallow: /wiki/Wikipedia:Administrator_elections +Disallow: /wiki/Wikipedia_talk:Administrator_elections +Disallow: /wiki/Wikipedia:Requests_for_comment/ + +Question 2: Are there specific rules for certain user agents? + +Yes. Wikipedia's robots.txt has rules for specific user agents. +Certain bots are completely blocked with Disallow: / including: +- sitecheck.internetseer.com +- Zealbot +- MSIECrawler +- SiteSnagger +- WebStripper +- WebCopier +- HTTrack +- larbin +- libwww +- fast (blocked for requesting too fast) +- wget (warned about recursive mode) + +Question 3: Why do websites use robots.txt? + +Websites use robots.txt to communicate crawling policies to web robots +and scrapers. 
It helps protect server resources by preventing bots from +making too many requests too quickly, which could cause a denial-of-service +effect. It also allows sites to keep certain pages like spam lists, +deletion discussions, and sandbox pages out of search engine indexes, +protecting both the site's integrity and its users' privacy. \ No newline at end of file diff --git a/assignment8/get_books.csv b/assignment8/get_books.csv new file mode 100644 index 0000000..1e0ba5e --- /dev/null +++ b/assignment8/get_books.csv @@ -0,0 +1,21 @@ +Title,Author,Format-Year +Spanish Romance Stories for Language Learning,"Rowett, Mary","eAudiobook, 2026" +Learning Spanish-beginner I,"Iris Acevedo A.; Spanishonline, Costarica","eBook, 2025 — Spanish" +Real-World Spanish: The Conversation Learning System,Camila Vega Rivera,"eAudiobook, 2025" +100 Facts About Learning Spanish,Science-Based Language Learning Lab,"eAudiobook, 2024" +A Beginner's Guide to Learning Spanish,"Miller, Jackson","eAudiobook, 2024" +No Tears Spanish Grammar: Easy Learning: Essential Rules for Beginners,"Bennett, Olivia","eBook, 2024" +100 Facts About Learning Spanish,Science-Based Language Learning Lab,"eBook, 2024" +The Ultimate Learning Spanish Blueprint - 10 Essential Steps,"Ramirez, Andres","eAudiobook, 2024" +Learning Spanish for Adults Beginner,"World, Spain","eBook, 2023" +Learning to Read in English and Spanish Made Easy,"Navarijo, Susie G.","eBook, 2022" +Spanish for Beginners: A Comprehensive Guide for Learning the Spanish Language Fast,Language Equipped Travelers,"eBook, 2021" +"Spanish: Beginner's Step by Step Course to Quickly Learning the Spanish Language, Spanish Grammar &","Michaels, Steven J.","eBook, 2021" +Learn Spanish Like a Native for Beginners - Level 1: Learning Spanish in Your Car Has Never Been Eas,Learn Like a Native,"eBook, 2021" +Learn Spanish Like a Native for Beginners - Level 2: Learning Spanish in Your Car Has Never Been Eas,Learn Like a Native,"eBook, 2021" +I'm Learning 
Spanish,"Gardner, James M.","eAudiobook, 2020 — Chinese" +I am learning Spanish,"Gardner, James M.","eAudiobook, 2018" +The Best Spanish Learning Games for Children,"Professor, Baby","eBook, 2017" +Easy Learning Spanish Vocabulary,"Dictionaries, Collins","eBook, 2016 — Spanish" +Spanish Easy Learning Complete Course,"Carmen García del Río; Fitzsimons, Ronan","eAudiobook, 2016" +Learning the Local Language: Your Guide to Real World Spanish,"Romey, Jared","eBook, 2013" diff --git a/assignment8/get_books.json b/assignment8/get_books.json new file mode 100644 index 0000000..c40b4f4 --- /dev/null +++ b/assignment8/get_books.json @@ -0,0 +1,102 @@ +[ + { + "Title": "Spanish Romance Stories for Language Learning", + "Author": "Rowett, Mary", + "Format-Year": "eAudiobook, 2026" + }, + { + "Title": "Learning Spanish-beginner I", + "Author": "Iris Acevedo A.; Spanishonline, Costarica", + "Format-Year": "eBook, 2025 — Spanish" + }, + { + "Title": "Real-World Spanish: The Conversation Learning System", + "Author": "Camila Vega Rivera", + "Format-Year": "eAudiobook, 2025" + }, + { + "Title": "100 Facts About Learning Spanish", + "Author": "Science-Based Language Learning Lab", + "Format-Year": "eAudiobook, 2024" + }, + { + "Title": "A Beginner's Guide to Learning Spanish", + "Author": "Miller, Jackson", + "Format-Year": "eAudiobook, 2024" + }, + { + "Title": "No Tears Spanish Grammar: Easy Learning: Essential Rules for Beginners", + "Author": "Bennett, Olivia", + "Format-Year": "eBook, 2024" + }, + { + "Title": "100 Facts About Learning Spanish", + "Author": "Science-Based Language Learning Lab", + "Format-Year": "eBook, 2024" + }, + { + "Title": "The Ultimate Learning Spanish Blueprint - 10 Essential Steps", + "Author": "Ramirez, Andres", + "Format-Year": "eAudiobook, 2024" + }, + { + "Title": "Learning Spanish for Adults Beginner", + "Author": "World, Spain", + "Format-Year": "eBook, 2023" + }, + { + "Title": "Learning to Read in English and Spanish Made Easy", + "Author": 
"Navarijo, Susie G.", + "Format-Year": "eBook, 2022" + }, + { + "Title": "Spanish for Beginners: A Comprehensive Guide for Learning the Spanish Language Fast", + "Author": "Language Equipped Travelers", + "Format-Year": "eBook, 2021" + }, + { + "Title": "Spanish: Beginner's Step by Step Course to Quickly Learning the Spanish Language, Spanish Grammar &", + "Author": "Michaels, Steven J.", + "Format-Year": "eBook, 2021" + }, + { + "Title": "Learn Spanish Like a Native for Beginners - Level 1: Learning Spanish in Your Car Has Never Been Eas", + "Author": "Learn Like a Native", + "Format-Year": "eBook, 2021" + }, + { + "Title": "Learn Spanish Like a Native for Beginners - Level 2: Learning Spanish in Your Car Has Never Been Eas", + "Author": "Learn Like a Native", + "Format-Year": "eBook, 2021" + }, + { + "Title": "I'm Learning Spanish", + "Author": "Gardner, James M.", + "Format-Year": "eAudiobook, 2020 — Chinese" + }, + { + "Title": "I am learning Spanish", + "Author": "Gardner, James M.", + "Format-Year": "eAudiobook, 2018" + }, + { + "Title": "The Best Spanish Learning Games for Children", + "Author": "Professor, Baby", + "Format-Year": "eBook, 2017" + }, + { + "Title": "Easy Learning Spanish Vocabulary", + "Author": "Dictionaries, Collins", + "Format-Year": "eBook, 2016 — Spanish" + }, + { + "Title": "Spanish Easy Learning Complete Course", + "Author": "Carmen García del Río; Fitzsimons, Ronan", + "Format-Year": "eAudiobook, 2016" + }, + { + "Title": "Learning the Local Language: Your Guide to Real World Spanish", + "Author": "Romey, Jared", + "Format-Year": "eBook, 2013" + } +] \ No newline at end of file diff --git a/assignment8/get_books.py b/assignment8/get_books.py new file mode 100644 index 0000000..6597aac --- /dev/null +++ b/assignment8/get_books.py @@ -0,0 +1,81 @@ +# HTML/DOM Exploration Notes +# =========================== +# URL scraped: https://durhamcounty.bibliocommons.com/v2/search?query=learning%20spanish&searchType=smart +# +# Search result list 
# HTML/DOM Exploration Notes
# ===========================
# URL scraped: https://durhamcounty.bibliocommons.com/v2/search?query=learning%20spanish&searchType=smart
#
# Search result list item:
#   Tag: li
#   Class: cp-search-result-item
#
# Title element:
#   Tag: span or a
#   Class: title-content
#
# Author element:
#   Tag: a (link)
#   Class: author-link
#   Note: multiple authors joined with semicolon ;
#
# Format/Year element:
#   Tag: span inside div
#   Class: manifestation-item-format-info-wrap
#   Note: we grab the first span inside that div

import json

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Task 3: Load the Durham County search page and scrape the result list
url = "https://durhamcounty.bibliocommons.com/v2/search?query=learning%20spanish&searchType=smart"
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# One dict per book: {"Title": ..., "Author": ..., "Format-Year": ...}
results = []

# Everything that talks to the browser lives inside try/finally so the
# Chrome/chromedriver process is shut down even when the wait times out or
# an element lookup raises.
try:
    # Step 1: Load the web page
    driver.get(url)

    # Step 2: Wait (up to 15 seconds) until a result item is in the DOM
    wait = WebDriverWait(driver, 15)
    wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "li.cp-search-result-item"))
    )

    # Step 3: Find all the li elements, one per search result
    books = driver.find_elements(By.CSS_SELECTOR, "li.cp-search-result-item")

    # Step 4: Extract data from each book
    for book in books:
        # Title: find_elements returns [] instead of raising, so a result
        # item without a title yields "" rather than aborting the scrape.
        title_elements = book.find_elements(By.CSS_SELECTOR, ".title-content")
        title = title_elements[0].text if title_elements else ""

        # Authors: there may be several links; join them with semicolons.
        authors = book.find_elements(By.CSS_SELECTOR, ".author-link")
        author_text = "; ".join(author.text for author in authors)

        # Format and year: text of the first span in the format wrapper,
        # or "" when the wrapper is absent.
        format_elements = book.find_elements(
            By.CSS_SELECTOR, ".manifestation-item-format-info-wrap span"
        )
        format_year = format_elements[0].text if format_elements else ""

        results.append({
            "Title": title,
            "Author": author_text,
            "Format-Year": format_year,
        })
finally:
    # The extracted values are plain strings, so the browser is no longer
    # needed once the loop is done (or has failed).
    driver.quit()

# Task 4: Prepare the DataFrame and save files
# Step 1: Create a DataFrame from the results list
df = pd.DataFrame(results)
print(df)

# Step 2: Write the DataFrame to get_books.csv (no index column)
df.to_csv("get_books.csv", index=False)

# Step 3: Write the results list to get_books.json; ensure_ascii=False keeps
# non-ASCII characters (accents, dashes) readable in the file.
with open("get_books.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2, ensure_ascii=False)
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Task 6: Extract OWASP Top 10 vulnerabilities
base_url = "https://owasp.org/www-project-top-ten/"

# XPath for the ten entry links inside <main>: any <a> whose trimmed text
# starts with "A01:2025" ... "A10:2025". The clauses are generated rather
# than written out by hand; the resulting expression is the same string.
xpath_expr = (
    "//main//a["
    + " or ".join(
        f"starts-with(normalize-space(.), 'A{rank:02d}:2025')"
        for rank in range(1, 11)
    )
    + "]"
)

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# One dict per vulnerability: {"Title": ..., "Link": ...}
results = []

# All browser interaction is wrapped in try/finally so the Chrome/chromedriver
# process is closed even if a wait times out or a lookup raises.
try:
    # Step 1: Load the OWASP project page
    driver.get(base_url)
    wait = WebDriverWait(driver, 15)

    # Step 2: Navigate to the OWASP Top Ten 2025 page by following the
    # landing-page link with that exact text.
    link_2025 = wait.until(
        EC.presence_of_element_located((By.LINK_TEXT, "OWASP Top Ten 2025"))
    )
    page_2025 = link_2025.get_attribute("href")
    driver.get(page_2025)
    wait.until(EC.presence_of_element_located((By.TAG_NAME, "main")))

    # Step 3: Find the top 10 vulnerability links using XPath
    items = driver.find_elements(By.XPATH, xpath_expr)

    # Step 4: Extract title and link for each vulnerability
    for item in items:
        results.append({
            "Title": item.text.strip(),
            "Link": item.get_attribute("href"),
        })
finally:
    # Titles and hrefs are already plain strings, so the browser can go.
    driver.quit()

# Step 5: Save results to CSV and print them
if results:
    df = pd.DataFrame(results)
    print(df)
    df.to_csv("owasp_top_10.csv", index=False)
else:
    print("No OWASP top 10 items found.")