Geetha82 · Geetha82 · Apr 23, 2026 · Apr 23, 2026 · May 22, 2026
diff --git a/assignment4/additional_employees.json b/assignment4/additional_employees.json
@@ -0,0 +1,4 @@
+[
+    {"Name": "Eve", "Age": 28, "City": "Miami", "Salary": 60000},
+    {"Name": "Frank", "Age": 40, "City": "Seattle", "Salary": 95000}
+]
diff --git a/assignment4/assignment4.py b/assignment4/assignment4.py
@@ -0,0 +1,186 @@
+import pandas as pd
+
+# Task 1 - Build a DataFrame from a dictionary of columns
+# Each key is a column label; each list holds that column's values.
+# (Keyword form of the dict: the keys are still 'Name', 'Age', 'City'.)
+data = dict(
+    Name=['Alice', 'Bob', 'Charlie'],
+    Age=[25, 30, 35],
+    City=['New York', 'Los Angeles', 'Chicago'],
+)
+
+# Turn the column dictionary into a DataFrame.
+task1_data_frame = pd.DataFrame(data=data)
+
+# Show the DataFrame to confirm it was built.
+print(task1_data_frame)
+
+
+# Task 1 - Add a new column
+# assign() returns a new DataFrame with the Salary column appended,
+# leaving task1_data_frame untouched.
+task1_with_salary = task1_data_frame.assign(Salary=[70000, 80000, 90000])
+
+# Show the DataFrame that now includes Salary.
+print(task1_with_salary)
+
+
+# Task 1 - Modify an existing column
+# Derive task1_older from task1_with_salary with every Age raised by 1;
+# the source DataFrame is not modified.
+task1_older = task1_with_salary.assign(Age=lambda frame: frame['Age'] + 1)
+
+# Show the DataFrame with the incremented ages.
+print(task1_older)
+
+
+# Task 1 - Save the DataFrame as a CSV file
+# Write task1_older to employees.csv.
+# index=False keeps the row labels out of the file.
+task1_older.to_csv('employees.csv', index=False)
+
+# Confirm that the file was written.
+print("CSV file created!")
+
+
+# Task 2: Loading Data from CSV and JSON
+# Task 2: Read data from a CSV file
+# Read the employees.csv written in Task 1 back into task2_employees.
+task2_employees = pd.read_csv(filepath_or_buffer='employees.csv')
+
+# Show the loaded rows so they can be checked against the tests.
+print(task2_employees)
+
+# Task 2: Read data from a JSON file
+
+# additional_employees.json sits next to this script and holds:
+# [
+#     {"Name": "Eve", "Age": 28, "City": "Miami", "Salary": 60000},
+#     {"Name": "Frank", "Age": 40, "City": "Seattle", "Salary": 95000}
+# ]
+# The path is relative to the current working directory, so run this
+# script from inside the assignment4 folder (from the repository root
+# the path would be 'assignment4/additional_employees.json').
+# Load the JSON records into json_employees.
+json_employees = pd.read_json(path_or_buf='additional_employees.json')
+
+# Show the JSON-backed DataFrame so it can be checked against the tests.
+print(json_employees)
+
+# Task 2: Combine DataFrames
+# Stack the CSV rows and the JSON rows into more_employees;
+# ignore_index=True renumbers the combined rows from 0.
+more_employees = pd.concat((task2_employees, json_employees), ignore_index=True)
+
+# Show the combined DataFrame so it can be checked against the tests.
+print(more_employees)
+
+# Task 3: Data Inspection - Using Head, Tail, and Info Methods
+# Task 3: Use the head() method
+
+# first_three holds the first three rows of more_employees.
+first_three = more_employees.head(n=3)
+
+# Show the first three rows.
+print(first_three)
+
+# Task 3: Use the tail() method
+
+# last_two holds the final two rows of more_employees.
+last_two = more_employees.tail(n=2)
+
+# Show the last two rows.
+print(last_two)
+
+# Task 3: Get the shape of a DataFrame
+
+# employee_shape is the (row count, column count) tuple of more_employees.
+employee_shape = more_employees.shape
+
+# Show the shape tuple.
+print(employee_shape)
+
+# Task 3: Use the info() method
+
+# info() writes its summary (columns, non-null counts, dtypes) itself.
+more_employees.info()
+
+
+# Task 4: Data Cleaning
+
+# Task 4: Create a DataFrame from dirty_data.csv
+
+# Load dirty_data.csv (looked up relative to the current working directory).
+dirty_data = pd.read_csv('dirty_data.csv')
+
+# Show the raw data before any cleaning.
+print(dirty_data)
+
+# Work on a copy so dirty_data keeps the original values.
+clean_data = dirty_data.copy()
+
+# Task 4: Remove duplicate rows and print
+clean_data = clean_data.drop_duplicates()
+print(clean_data)
+
+# Task 4: Convert Age to numeric; unparseable entries become NaN
+clean_data['Age'] = pd.to_numeric(clean_data['Age'], errors='coerce')
+
+# Handle the missing ages by filling them with the column mean, then print
+clean_data['Age'] = clean_data['Age'].fillna(clean_data['Age'].mean())
+print(clean_data)
+
+# Task 4 - Replace Salary placeholders (unknown, n/a), convert to numeric, print
+clean_data['Salary'] = clean_data['Salary'].replace(['unknown', 'n/a'], pd.NA)
+clean_data['Salary'] = pd.to_numeric(clean_data['Salary'], errors='coerce')
+print(clean_data)
+
+# Task 4 - Fill missing numeric values
+# Fill Age with the mean (a no-op here: Age was already filled above)
+clean_data['Age'] = clean_data['Age'].fillna(clean_data['Age'].mean())
+
+# Fill Salary with the median
+clean_data['Salary'] = clean_data['Salary'].fillna(clean_data['Salary'].median())
+
+print(clean_data)
+
+
+# Task 4 - Convert Hire Date to datetime; unparseable dates become NaT
+clean_data['Hire Date'] = pd.to_datetime(clean_data['Hire Date'], errors='coerce')
+
+# Forward-fill missing dates from the previous row. Series.ffill() replaces
+# fillna(method='ffill'), whose `method` argument is deprecated in pandas.
+clean_data['Hire Date'] = clean_data['Hire Date'].ffill()
+print(clean_data)
+
+# Task 4 - Strip extra whitespace and standardize Name and Department as uppercase
+clean_data['Name'] = clean_data['Name'].str.strip().str.upper()
+clean_data['Department'] = clean_data['Department'].str.strip().str.upper()
+
+print(clean_data)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/assignment4/employees.csv b/assignment4/employees.csv
@@ -0,0 +1,4 @@
+Name,Age,City,Salary
+Alice,26,New York,70000
+Bob,31,Los Angeles,80000
+Charlie,36,Chicago,90000
diff --git a/assignment8/assignment8.py b/assignment8/assignment8.py
@@ -0,0 +1,32 @@
+# ==============================================================================
+# Task 1: Review robots.txt to Ensure Policy Compliance
+# Checked: https://durhamcountylibrary.org/robots.txt
+# Policy Assessment: 
+#   - User-agent: * applies to this custom Selenium script.
+#   - Path '/wp-admin/' is forbidden.
+#   - Public library content data collection is permitted.
+#   - Conclusion: The target scraping steps do not breach the site policy.
+# ==============================================================================
+
+# ==============================================================================
+# Task 2: Understanding HTML and the DOM for the Durham Library Site
+# Documented Class Values and HTML Structures:
+# 
+# 1. Single Entry Container:
+#    - HTML Tag: <li>
+#    - Class Value: row cp-search-result-item
+#
+# 2. Title Element:
+#    - HTML Tag: <span>
+#    - Class Value: cp-title
+#
+# 3. Author Element:
+#    - HTML Tag: <a> (Anchor link)
+#    - Class Value: author-link
+#    - Strategy for Multiple Authors: Use find_elements() to capture all instances.
+#
+# 4. Format and Year Container:
+#    - Parent HTML Tag: <div>
+#    - Parent Class Value: manifestation-details
+#    - Specific Year Tag/Class: span.cp-published-year
+# ==============================================================================
diff --git a/assignment8/challenges.txt b/assignment8/challenges.txt
@@ -0,0 +1,27 @@
+==============================================================================
+Assignment 8 - Challenge Log & Resolution Report (challenges.txt)
+==============================================================================
+
+
+Challenge 1: Deprecated DOM Container IDs and Layout Traps
+------------------------------------------------------------------------------
+* Problem: Initial inspection notes for Task 2 suggested targeting elements like 
+  "cp-search-result-item" or OWASP's "main_content" ID wrapper. However, recent 
+  client-side framework structural updates on both target sites caused these literal 
+  selectors to return zero matches or throw element collection exceptions.
+* Resolution: Broadened search rules away from rigid ID strings. Implemented a
+  semantic XPath approach ("//li[contains(@class, 'cp-search-result-item')]" and
+  global "//*[contains(...)]" lookups) combined with strict string filter validation
+  hooks (e.g., checking if text nodes started with official category headers like
+  "A01" through "A10") to isolate data reliably regardless of layout changes.
+
+Challenge 2: Hidden Type Errors (List objects vs. Flat Strings) in Data Frames
+------------------------------------------------------------------------------
+* Problem: In early test iterations of the loop, splitting raw string components 
+  accidentally stored data variables inside a Python list object structure inside 
+  individual cells. When Pandas attempted to process and validate columns containing 
+  nested arrays, it threw hidden Type errors that bypassed extraction code blocks and 
+  rendered empty rows or broken tabular fields.
+* Resolution: Enforced strict flat text extraction inside independent try/except isolation 
+  blocks. Ensured all extracted nodes were processed into pure strings prior to appending 
+  them to the results dictionary container, which stabilized the data layout.
diff --git a/assignment8/ethical_scraping.txt b/assignment8/ethical_scraping.txt
@@ -0,0 +1,35 @@
+==============================================================================
+Task 5: Ethical Web Scraping (Wikipedia Robots.txt Analysis)
+==============================================================================
+
+1. Which sections of the website are restricted for crawling?
+------------------------------------------------------------------------------
+The restricted sections depend on the specific user agent. However, for general
+crawlers and specific aggressive bots, heavy restrictions or total bans apply.
+- Admin, backend, and technical script paths like /w/, /api/, and /wiki/Special:
+  are disallowed to save processing bandwidth.
+- Query mutation paths, search result loops, and dynamically generated query 
+  URLs (e.g., /?curid=) are disallowed to prevent infinite crawling loops.
+- For completely restricted user-agents (such as MJ12bot or UbiCrawler), the 
+  entire root directory ("Disallow: /") is banned from access.
+
+2. Are there specific rules for certain user agents?
+------------------------------------------------------------------------------
+Yes, Wikipedia specifies unique rules for distinct user agents:
+- Outright Bans ("Disallow: /"): Applied to aggressive or non-search crawlers
+  like MJ12bot, UbiCrawler, DOC, Zao, and advertising bots like Mediapartners-Google*.
+- Unlimited Access ("Disallow: "): Applied explicitly to Wikipedia's internal
+  work, maintenance, and translation bots such as IsraBot and Orthogaffe.
+- Global Wildcard ("User-agent: *"): Applies a massive list of disallowed 
+  sub-paths and rate limits to all unspecified scrapers, spiders, and automated
+  frameworks.
+
+3. Reflection: The Purpose of Robots.txt & Ethical Scraping
+------------------------------------------------------------------------------
+Websites utilize a robots.txt file to communicate boundaries and access rules 
+to automated programs, protecting finite server resources from getting overwhelmed 
+by rapid requests. It promotes ethical scraping by fostering a mutual agreement 
+of respect between developer automation and site administrators, ensuring data 
+harvesting doesn't crash a site's infrastructure, breach security, or disrupt 
+the experience of human users. By checking and obeying these rules, developers 
+ensure their tools act as good citizens of the web ecosystem.
diff --git a/assignment8/get_books.csv b/assignment8/get_books.csv
@@ -0,0 +1,21 @@
+Title,Author,Format-Year
+Real-World Spanish: The Conversation Learning System,Camila Vega Rivera,"Real-World Spanish: The Conversation Learning System, eAudiobook / eAudiobook, 2025"
+Learning Spanish-beginner I,"Iris Acevedo A.; Spanishonline, Costarica","Learning Spanish-beginner I, eBook / eBook, 2025 — Spanish / eBook, 2025. Language: Spanish"
+100 Facts About Learning Spanish,Science-Based Language Learning Lab,"eAudiobook, 2024 / 100 Facts About Learning Spanish, eAudiobook"
+A Beginner's Guide to Learning Spanish,"Miller, Jackson","eAudiobook, 2024 / A Beginner's Guide to Learning Spanish, eAudiobook"
+No Tears Spanish Grammar: Easy Learning: Essential Rules for Beginners,"Bennett, Olivia","No Tears Spanish Grammar: Easy Learning: Essential Rules for Beginners, eBook / eBook, 2024"
+100 Facts About Learning Spanish,Science-Based Language Learning Lab,"eBook, 2024 / 100 Facts About Learning Spanish, eBook"
+Learning Spanish for Adults Beginner,"World, Spain","eBook, 2023 / Learning Spanish for Adults Beginner, eBook"
+Learning to Read in English and Spanish Made Easy,"Navarijo, Susie G.","eBook, 2022 / Learning to Read in English and Spanish Made Easy, eBook"
+Spanish for Beginners: A Comprehensive Guide for Learning the Spanish Language Fast,Language Equipped Travelers,"eBook, 2021 / Spanish for Beginners: A Comprehensive Guide for Learning the Spanish Language Fast, eBook"
+"Spanish: Beginner's Step by Step Course to Quickly Learning the Spanish Language, Spanish Grammar &","Michaels, Steven J.","eBook, 2021 / Spanish: Beginner's Step by Step Course to Quickly Learning the Spanish Language, Spanish Grammar &, eBook"
+Learn Spanish Like a Native for Beginners - Level 1: Learning Spanish in Your Car Has Never Been Eas,Learn Like a Native,"Learn Spanish Like a Native for Beginners - Level 1: Learning Spanish in Your Car Has Never Been Eas, eBook / eBook, 2021"
+Learn Spanish Like a Native for Beginners - Level 2: Learning Spanish in Your Car Has Never Been Eas,Learn Like a Native,"eBook, 2021 / Learn Spanish Like a Native for Beginners - Level 2: Learning Spanish in Your Car Has Never Been Eas, eBook"
+I'm Learning Spanish,"Gardner, James M.","eAudiobook, 2020 — Chinese / I'm Learning Spanish, eAudiobook / eAudiobook, 2020. Language: Chinese"
+I am learning Spanish,"Gardner, James M.","I am learning Spanish, eAudiobook / eAudiobook, 2018"
+The Best Spanish Learning Games for Children,"Professor, Baby","eBook, 2017 / The Best Spanish Learning Games for Children, eBook"
+Easy Learning Spanish Vocabulary,"Dictionaries, Collins","eBook, 2016. Language: Spanish / eBook, 2016 — Spanish / Easy Learning Spanish Vocabulary, eBook"
+Spanish Easy Learning Complete Course,"Carmen García del Río; Fitzsimons, Ronan","Spanish Easy Learning Complete Course, eAudiobook / eAudiobook, 2016"
+Learning the Local Language: Your Guide to Real World Spanish,"Romey, Jared","Learning the Local Language: Your Guide to Real World Spanish, eBook / eBook, 2013"
+Expressing Emotion with the Subjunctive,Unknown Author,"Expressing Emotion with the Subjunctive, Streaming Video / Streaming Video, 2017"
+Advanced Work with the Preterite Tense,Unknown Author,"Streaming Video, 2017 / Advanced Work with the Preterite Tense, Streaming Video"