Skip to content

Commit fa68154

Browse files
authored
Update README.md
1 parent dd64c02 commit fa68154

File tree

1 file changed

+37
-29
lines changed

1 file changed

+37
-29
lines changed

README.md

Lines changed: 37 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ over large datasets
88
* About dataset
99
* Environment
1010
* Extract the Data
11-
* Create Sqoop Job
11+
* Hive Querying
1212

1313
<hr>
1414

@@ -52,13 +52,15 @@ mv ml-1m/users.dat /ml-1m/users.csv
5252
5353
* Create movies.sql, ratings.sql, users.sql
5454
```
55-
nano movies.sql >
55+
nano movies.sql
5656
nano ratings.sql
5757
nano users.sql
5858
```
5959
6060
Copy the SQL code from the repo files movies.sql, ratings.sql, and users.sql
61-
<code> hive -f users.sql </code>
61+
```
62+
hive -f users.sql
63+
```
6264
6365
![image](https://user-images.githubusercontent.com/69738890/95402545-a1c48800-08d5-11eb-9b59-3a7051eaea5c.png)
6466
@@ -67,53 +69,59 @@ OR manually execute the commands in the hive shell as shown below
6769
![image](https://user-images.githubusercontent.com/69738890/95404381-7bedb200-08da-11eb-8aee-cb0f2d432d13.png)
6870
6971
# EXPLORED QUESTIONS
70-
##### Top 10 viewed movies</br>
71-
<CODE>
72+
#### Top 10 viewed movies
73+
```
7274
SELECT movies.MovieID,movies.Title,COUNT(DISTINCT ratings.UserID) as views
7375
FROM movies JOIN ratings ON (movies.MovieID = ratings.MovieID)
7476
GROUP BY movies.MovieID, movies.Title
7577
ORDER BY views DESC
7678
LIMIT 10;
77-
</CODE>
78-
</BR>
79-
79+
```
8080
![image](https://user-images.githubusercontent.com/69738890/95404826-bb68ce00-08db-11eb-94c1-bbf7bca70d1c.png)
8181
82-
</BR>
83-
##### Top 20 rated movies having at least 40 views</br>
84-
<CODE>
82+
#### Top 20 rated movies having at least 40 views
83+
```
8584
SELECT movies.MovieID,movies.Title,AVG(ratings.Rating) as rtg,COUNT(DISTINCT ratings.UserID) as views
8685
FROM movies JOIN ratings ON (movies.MovieID = ratings.MovieID)
8786
GROUP BY movies.MovieID,movies.Title
8887
HAVING views >= 40
8988
ORDER BY rtg DESC
9089
LIMIT 20;
9190
</CODE>
92-
</br>
93-
91+
```
9492
![image](https://user-images.githubusercontent.com/69738890/95405157-a3457e80-08dc-11eb-8b6b-b07bdaba0533.png)
9593
96-
</br>
97-
98-
Create exploded view of movie id and genre</br>
99-
<CODE>
100-
create view movie_by_genre as select movieid, genre from (select movieid, split(genres, '\\|') genres from movies) t lateral view explode(genres) t as genre;
101-
<CODE>
102-
</br>
94+
#### Create exploded view of movie id and genre
95+
```
96+
CREATE VIEW movie_by_genre AS SELECT movieid, genre FROM
97+
(
98+
SELECT movieid, split(genres, '\\|') genres FROM movies
99+
) t LATERAL VIEW EXPLODE(genres) t as genre;
100+
```
103101
104102
![image](https://user-images.githubusercontent.com/69738890/95405324-18b14f00-08dd-11eb-971d-3ac31f693342.png)
105103
104+
#### Find top 3 genres for each user
105+
```
106+
CREATE TEMPORARY TABLE movie_by_user_genre as
107+
SELECT t1.*, t2.rating,t2.userid
108+
FROM movie_by_genre t1 LEFT JOIN ratings t2
109+
ON t1.movieid = t2.movieid WHERE t2.rating >= 4;
110+
```
106111
107-
Find top 3 genres for each user</br>
108-
<CODE>
109-
create temporary table movie_by_user_genre as select t1.*, t2.rating,t2.userid from movie_by_genre t1 left join ratings t2 on t1.movieid = t2.movieid where t2.rating >= 4;
110-
111-
create temporary table user_by_genre_totalrating as select userid, genre, sum(rating) total_rating from movie_by_user_genre group by userid, genre;
112+
```
113+
CREATE TEMPORARY TABLE user_by_genre_totalrating as
114+
SELECT userid, genre, sum(rating) total_rating
115+
FROM movie_by_user_genre GROUP BY userid, genre;
116+
```
112117
113-
select * from
114-
(select userid, genre, row_number() over (partition by userid order by total_rating desc) row_num from user_by_genre_totalrating) t where t.row_num <= 3;
115-
</CODE>
116-
</br>
118+
```
119+
SELECT * FROM
120+
(SELECT userid, genre, ROW_NUMBER() OVER (PARTITION BY userid ORDER BY total_rating DESC) row_num
121+
FROM user_by_genre_totalrating) t
122+
WHERE t.row_num <= 3;
123+
```
124+
![image](https://user-images.githubusercontent.com/69738890/95407159-dfc7a900-08e1-11eb-91b5-92f0d76c4dc2.png)
117125
118126
119127

0 commit comments

Comments
 (0)