Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions assignment4/additional_employees.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
[
{"Name": "Eve", "Age": 28, "City": "Miami", "Salary": 60000},
{"Name": "Frank", "Age": 40, "City": "Seattle", "Salary": 95000}
]
186 changes: 186 additions & 0 deletions assignment4/assignment4.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
import pandas as pd

# Task 1 - Build a DataFrame from a dictionary
# Each key becomes a column; each list holds that column's values, row by row.
data = dict(
    Name=['Alice', 'Bob', 'Charlie'],
    Age=[25, 30, 35],
    City=['New York', 'Los Angeles', 'Chicago'],
)

# Turn the dictionary into a DataFrame and show it.
task1_data_frame = pd.DataFrame(data)
print(task1_data_frame)


# Task 1 - Add a new column
# assign() returns a new DataFrame, so task1_data_frame keeps its three
# original columns while the result gains Salary as a fourth.
task1_with_salary = task1_data_frame.assign(Salary=[70000, 80000, 90000])
print(task1_with_salary)

# Task 1 - Modify an existing column
# task1_older is a separate DataFrame in which everyone is one year older;
# task1_with_salary is left unchanged.
task1_older = task1_with_salary.assign(Age=task1_with_salary['Age'] + 1)
print(task1_older)

# Task 1 - Save the DataFrame as a CSV file
# index=False keeps the row index out of employees.csv.
task1_older.to_csv('employees.csv', index=False)

# Confirmation message once the file has been written.
print("CSV file created!")


# Task 2: Loading Data from CSV and JSON

# Task 2: Read data from a CSV file
# employees.csv is the file written at the end of Task 1; the path is
# relative to the directory the script is run from.
task2_employees = pd.read_csv('employees.csv')
print(task2_employees)

# Task 2: Read data from a JSON file
# additional_employees.json holds a list of records with the keys
# Name, Age, City and Salary (Eve and Frank). The path is relative to the
# directory the script is run from.
json_employees = pd.read_json('additional_employees.json')
print(json_employees)

# Task 2: Combine DataFrames
# Stack the JSON rows underneath the CSV rows. ignore_index=True renumbers
# the combined rows from 0 rather than keeping each source's own index.
more_employees = pd.concat(
    (task2_employees, json_employees),
    ignore_index=True,
)
print(more_employees)

# Task 3: Data Inspection - Using Head, Tail, and Info Methods

# Task 3: head() - first_three holds the first three rows of more_employees
first_three = more_employees.head(n=3)
print(first_three)

# Task 3: tail() - last_two holds the last two rows of more_employees
last_two = more_employees.tail(n=2)
print(last_two)

# Task 3: shape - employee_shape is the (rows, columns) tuple of more_employees
employee_shape = more_employees.shape
print(employee_shape)

# Task 3: info() - prints its own concise summary, so no print() call is needed
more_employees.info()


# Task 4: Data Cleaning

# Task 4: Load dirty_data.csv into dirty_data and display the raw rows
dirty_data = pd.read_csv('dirty_data.csv')
print(dirty_data)

# Task 4: Remove duplicate rows
# The work happens on a copy, so dirty_data stays exactly as it was loaded.
clean_data = dirty_data.copy().drop_duplicates()
print(clean_data)

# Task 4: Convert Age to numeric
# errors='coerce' turns values that cannot be parsed into NaN instead of
# raising; those gaps are then filled with the mean of the column.
clean_data['Age'] = pd.to_numeric(clean_data['Age'], errors='coerce')
clean_data['Age'] = clean_data['Age'].fillna(clean_data['Age'].mean())
print(clean_data)

# Task 4: Convert Salary to numeric
# The known placeholders 'unknown' and 'n/a' are swapped for a missing value
# before the conversion; anything else unparseable is coerced to NaN too.
clean_data['Salary'] = pd.to_numeric(
    clean_data['Salary'].replace(['unknown', 'n/a'], pd.NA),
    errors='coerce',
)
print(clean_data)


# Task 4: Fill missing numeric values
# Age with the mean of the column
clean_data['Age'] = clean_data['Age'].fillna(clean_data['Age'].mean())

# Salary with the median of the column
clean_data['Salary'] = clean_data['Salary'].fillna(clean_data['Salary'].median())

print(clean_data)


# Task 4 - Convert Hire Date to datetime
# errors='coerce' turns values that cannot be parsed as dates into NaT
# instead of raising.
clean_data['Hire Date'] = pd.to_datetime(clean_data['Hire Date'], errors='coerce')

# Forward-fill the gaps: each missing date takes the value of the nearest
# earlier row that has one. Series.ffill() is used because
# fillna(method='ffill') is deprecated since pandas 2.1 and rejected by
# newer releases.
clean_data['Hire Date'] = clean_data['Hire Date'].ffill()
print(clean_data)

# Task 4 - Standardize the text columns
# Name and Department get the same treatment: surrounding whitespace is
# stripped, then the text is converted to uppercase.
for column in ('Name', 'Department'):
    clean_data[column] = clean_data[column].str.strip().str.upper()

print(clean_data)



























4 changes: 4 additions & 0 deletions assignment4/employees.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Name,Age,City,Salary
Alice,26,New York,70000
Bob,31,Los Angeles,80000
Charlie,36,Chicago,90000
32 changes: 32 additions & 0 deletions assignment8/assignment8.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# ==============================================================================
# Task 1: Review robots.txt to Ensure Policy Compliance
# Checked: https://durhamcountylibrary.org/robots.txt
# Policy Assessment:
# - User-agent: * applies to this custom Selenium script.
# - Path '/wp-admin/' is forbidden.
# - Public library content data collection is permitted.
# - Conclusion: The target scraping steps do not breach the site policy.
# ==============================================================================

# ==============================================================================
# Task 2: Understanding HTML and the DOM for the Durham Library Site
# Documented Class Values and HTML Structures:
#
# 1. Single Entry Container:
# - HTML Tag: <li>
# - Class Value: row cp-search-result-item
#
# 2. Title Element:
# - HTML Tag: <span>
# - Class Value: cp-title
#
# 3. Author Element:
# - HTML Tag: <a> (Anchor link)
# - Class Value: author-link
# - Strategy for Multiple Authors: Use find_elements() to capture all instances.
#
# 4. Format and Year Container:
# - Parent HTML Tag: <div>
# - Parent Class Value: manifestation-details
# - Specific Year Tag/Class: span.cp-published-year
# ==============================================================================
27 changes: 27 additions & 0 deletions assignment8/challenges.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
==============================================================================
Assignment 8 - Challenge Log & Resolution Report (challenges.txt)
==============================================================================


Challenge 1: Deprecated DOM Container IDs and Layout Traps
------------------------------------------------------------------------------
* Problem: Initial inspection notes for Task 2 suggested targeting elements like
"cp-search-result-item" or OWASP's "main_content" ID wrapper. However, recent
client-side framework structural updates on both target sites caused these literal
selectors to return zero matches or throw element collection exceptions.
* Resolution: Broadened search rules away from rigid ID strings. Implemented a
semantic XPath approach ("//li[contains(@class, 'cp-search-result-item')]" and global
"//*[contains(...)]" lookups) combined with strict string filter validation hooks
(e.g., checking if text nodes started with official category headers like "A01"
through "A10") to isolate data reliably regardless of layout changes.

Challenge 2: Hidden Type Errors (List objects vs. Flat Strings) in Data Frames
------------------------------------------------------------------------------
* Problem: In early test iterations of the loop, splitting raw string components
accidentally stored data variables inside a Python list object structure inside
individual cells. When Pandas attempted to process and validate columns containing
nested arrays, it threw hidden Type errors that bypassed extraction code blocks and
rendered empty rows or broken tabular fields.
* Resolution: Enforced strict flat text extraction inside independent try/except isolation
blocks. Ensured all extracted nodes were processed into pure strings prior to appending
them to the results dictionary container, which stabilized the data layout.
35 changes: 35 additions & 0 deletions assignment8/ethical_scraping.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
==============================================================================
Task 5: Ethical Web Scraping (Wikipedia Robots.txt Analysis)
==============================================================================

1. Which sections of the website are restricted for crawling?
------------------------------------------------------------------------------
The restricted sections depend on the specific user agent.
However, for general crawlers and specific aggressive bots, heavy restrictions or total bans apply.
- Admin, backend, and technical script paths like /w/, /api/, and /wiki/Special:
are disallowed to save processing bandwidth.
- Query mutation paths, search result loops, and dynamically generated query
URLs (e.g., /?curid=) are disallowed to prevent infinite crawling loops.
- For completely restricted user-agents (such as MJ12bot or UbiCrawler), the
entire root directory ("Disallow: /") is banned from access.

2. Are there specific rules for certain user agents?
------------------------------------------------------------------------------
Yes, Wikipedia specifies unique rules for distinct user agents:
- Outright Bans ("Disallow: /"): Applied to aggressive or non-search crawlers
like MJ12bot, UbiCrawler, DOC, Zao, and advertising bots like Mediapartners-Google*.
- Unlimited Access ("Disallow: "): Applied explicitly to Wikipedia's internal
work, maintenance, and translation bots such as IsraBot and Orthogaffe.
- Global Wildcard ("User-agent: *"): Applies a massive list of disallowed
sub-paths and rate limits to all unspecified scrapers, spiders, and automated
frameworks.

3. Reflection: The Purpose of Robots.txt & Ethical Scraping
------------------------------------------------------------------------------
Websites utilize a robots.txt file to communicate boundaries and access rules
to automated programs, protecting finite server resources from getting overwhelmed
by rapid requests. It promotes ethical scraping by fostering a mutual agreement
of respect between developer automation and site administrators, ensuring data
harvesting doesn't crash a site's infrastructure, breach security, or disrupt
the experience of human users. By checking and obeying these rules, developers
ensure their tools act as good citizens of the web ecosystem.
21 changes: 21 additions & 0 deletions assignment8/get_books.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
Title,Author,Format-Year
Real-World Spanish: The Conversation Learning System,Camila Vega Rivera,"Real-World Spanish: The Conversation Learning System, eAudiobook / eAudiobook, 2025"
Learning Spanish-beginner I,"Iris Acevedo A.; Spanishonline, Costarica","Learning Spanish-beginner I, eBook / eBook, 2025 — Spanish / eBook, 2025. Language: Spanish"
100 Facts About Learning Spanish,Science-Based Language Learning Lab,"eAudiobook, 2024 / 100 Facts About Learning Spanish, eAudiobook"
A Beginner's Guide to Learning Spanish,"Miller, Jackson","eAudiobook, 2024 / A Beginner's Guide to Learning Spanish, eAudiobook"
No Tears Spanish Grammar: Easy Learning: Essential Rules for Beginners,"Bennett, Olivia","No Tears Spanish Grammar: Easy Learning: Essential Rules for Beginners, eBook / eBook, 2024"
100 Facts About Learning Spanish,Science-Based Language Learning Lab,"eBook, 2024 / 100 Facts About Learning Spanish, eBook"
Learning Spanish for Adults Beginner,"World, Spain","eBook, 2023 / Learning Spanish for Adults Beginner, eBook"
Learning to Read in English and Spanish Made Easy,"Navarijo, Susie G.","eBook, 2022 / Learning to Read in English and Spanish Made Easy, eBook"
Spanish for Beginners: A Comprehensive Guide for Learning the Spanish Language Fast,Language Equipped Travelers,"eBook, 2021 / Spanish for Beginners: A Comprehensive Guide for Learning the Spanish Language Fast, eBook"
"Spanish: Beginner's Step by Step Course to Quickly Learning the Spanish Language, Spanish Grammar &","Michaels, Steven J.","eBook, 2021 / Spanish: Beginner's Step by Step Course to Quickly Learning the Spanish Language, Spanish Grammar &, eBook"
Learn Spanish Like a Native for Beginners - Level 1: Learning Spanish in Your Car Has Never Been Eas,Learn Like a Native,"Learn Spanish Like a Native for Beginners - Level 1: Learning Spanish in Your Car Has Never Been Eas, eBook / eBook, 2021"
Learn Spanish Like a Native for Beginners - Level 2: Learning Spanish in Your Car Has Never Been Eas,Learn Like a Native,"eBook, 2021 / Learn Spanish Like a Native for Beginners - Level 2: Learning Spanish in Your Car Has Never Been Eas, eBook"
I'm Learning Spanish,"Gardner, James M.","eAudiobook, 2020 — Chinese / I'm Learning Spanish, eAudiobook / eAudiobook, 2020. Language: Chinese"
I am learning Spanish,"Gardner, James M.","I am learning Spanish, eAudiobook / eAudiobook, 2018"
The Best Spanish Learning Games for Children,"Professor, Baby","eBook, 2017 / The Best Spanish Learning Games for Children, eBook"
Easy Learning Spanish Vocabulary,"Dictionaries, Collins","eBook, 2016. Language: Spanish / eBook, 2016 — Spanish / Easy Learning Spanish Vocabulary, eBook"
Spanish Easy Learning Complete Course,"Carmen García del Río; Fitzsimons, Ronan","Spanish Easy Learning Complete Course, eAudiobook / eAudiobook, 2016"
Learning the Local Language: Your Guide to Real World Spanish,"Romey, Jared","Learning the Local Language: Your Guide to Real World Spanish, eBook / eBook, 2013"
Expressing Emotion with the Subjunctive,Unknown Author,"Expressing Emotion with the Subjunctive, Streaming Video / Streaming Video, 2017"
Advanced Work with the Preterite Tense,Unknown Author,"Streaming Video, 2017 / Advanced Work with the Preterite Tense, Streaming Video"
Loading