-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathWorld_Life_Expectancy_SQL.sql
More file actions
151 lines (128 loc) · 4.45 KB
/
World_Life_Expectancy_SQL.sql
File metadata and controls
151 lines (128 loc) · 4.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
# World Life Expectancy Project
# ----------------------------
# Setup and Initial Inspection
# ----------------------------
USE world_life_expectancy;
# Preview the data to understand its structure
SELECT * FROM world_life_expectancy;
# -----------------------
# Data Cleaning
# -----------------------
# 1. Remove Duplicate Records
# Check for duplicates based on Country and Year and delete them
SELECT *
FROM (
SELECT Row_ID,
CONCAT(Country, Year),
ROW_NUMBER() OVER(PARTITION BY CONCAT(Country, Year) ORDER BY CONCAT(Country, Year)) AS Row_Num
FROM world_life_expectancy
) AS Row_table
WHERE Row_Num > 1;
DELETE FROM world_life_expectancy
WHERE Row_ID IN (
SELECT Row_ID
FROM (
SELECT Row_ID,
CONCAT(Country, Year),
ROW_NUMBER() OVER(PARTITION BY CONCAT(Country, Year) ORDER BY CONCAT(Country, Year)) AS Row_Num
FROM world_life_expectancy
) AS Row_table
WHERE Row_Num > 1
);
# 2. Check for Missing or Inconsistent Status Values
# Look for rows where status is empty
SELECT *
FROM world_life_expectancy
WHERE status = '';
# Confirm available status types and ensure consistency
SELECT DISTINCT(status)
FROM world_life_expectancy
WHERE status <> '';
# Update missing status values based on other rows with the same Country
UPDATE world_life_expectancy t1
INNER JOIN world_life_expectancy t2
ON t1.Country = t2.Country
SET t1.Status = 'Developing'
WHERE t1.Status = ''
AND t2.Status = 'Developing';
UPDATE world_life_expectancy t1
INNER JOIN world_life_expectancy t2
ON t1.Country = t2.Country
SET t1.Status = 'Developed'
WHERE t1.Status = ''
AND t2.Status = 'Developed';
# Re-check for any remaining empty status rows
SELECT *
FROM world_life_expectancy
WHERE Status = '' OR Status IS NULL;
# 3. Handle Missing Life Expectancy Data
# Identify records where Life Expectancy is missing
SELECT *
FROM world_life_expectancy
WHERE `Life expectancy` = '';
# Calculate and update missing Life Expectancy using the average of adjacent years
UPDATE world_life_expectancy t1
JOIN world_life_expectancy t2
ON t1.Country = t2.Country
AND t1.Year = t2.Year - 1
JOIN world_life_expectancy t3
ON t1.Country = t3.Country
AND t1.Year = t3.Year + 1
SET t1.`Life expectancy` = ROUND((t2.`Life expectancy` + t3.`Life expectancy`) / 2, 1)
WHERE t1.`Life expectancy` = '';
# -----------------------------
# Exploratory Data Analysis
# -----------------------------
# 1. General Overview
# Preview the cleaned dataset
SELECT * FROM world_life_expectancy;
# 2. Life Expectancy Trends by Country
# Determine max and min life expectancy and increase over 15 years for each country
SELECT Country,
MIN(`Life Expectancy`) AS Min_Life_Exp,
MAX(`Life Expectancy`) AS Max_Life_Exp,
ROUND(MAX(`Life Expectancy`) - MIN(`Life Expectancy`), 1) AS Life_Exp_Change
FROM world_life_expectancy
GROUP BY Country
HAVING Min_Life_Exp <> 0 AND Max_Life_Exp <> 0
ORDER BY Life_Exp_Change ASC;
# 3. Global Life Expectancy Trends
# Find average life expectancy over time
SELECT Year, ROUND(AVG(`Life Expectancy`), 2) AS Avg_Life_Exp
FROM world_life_expectancy
WHERE `Life Expectancy` <> 0
GROUP BY Year
ORDER BY Year;
# 4. GDP Correlation Analysis
# Explore life expectancy in relation to GDP and status
# GDP vs. Life Expectancy by Country
SELECT Country,
ROUND(AVG(`Life Expectancy`), 1) AS Avg_Life_Exp,
ROUND(AVG(GDP), 1) AS Avg_GDP
FROM world_life_expectancy
GROUP BY Country
HAVING Avg_Life_Exp > 0 AND Avg_GDP > 0
ORDER BY Avg_GDP DESC;
# Compare life expectancy for countries with high and low GDP
SELECT
SUM(CASE WHEN GDP >= 1500 THEN 1 ELSE 0 END) AS High_GDP_Count,
AVG(CASE WHEN GDP >= 1500 THEN `Life Expectancy` ELSE NULL END) AS High_GDP_Life_Exp,
SUM(CASE WHEN GDP < 1500 THEN 1 ELSE 0 END) AS Low_GDP_Count,
AVG(CASE WHEN GDP < 1500 THEN `Life Expectancy` ELSE NULL END) AS Low_GDP_Life_Exp
FROM world_life_expectancy;
# 5. Status-based Analysis
# Explore life expectancy averages by development status
SELECT Status,
ROUND(AVG(`Life Expectancy`), 1) AS Avg_Life_Exp
FROM world_life_expectancy
GROUP BY Status;
# 6. Rolling Totals for Adult Mortality
# Calculate cumulative sum of Adult Mortality by country and year
SELECT Country,
Year,
`Life Expectancy`,
`Adult Mortality`,
SUM(`Adult Mortality`) OVER (PARTITION BY Country ORDER BY Year) AS Rolling_Total
FROM world_life_expectancy
WHERE Country LIKE '%United%';
# End of Exploratory Data Analysis