BigData3 基于HIVE的数据分析

作业

热门问题:老师,我作业数据呢?

在希冀平台实验2中,找到它的作业那一栏,能注意到左上角突然出现了 实验数据 一栏。
1763436689806.png

然后就可以载入数据了。

hive不支持使用##作为分隔符,采取了正则匹配成功加载了数据。

first edition was made by deepseek
fix by fengling
thank for @zc.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
-- =============================================
-- 1. Create Tables and Import Data
-- =============================================

set hive.cli.print.header=true;

-- Create users table
CREATE TABLE users (
userid BIGINT,
gender STRING,
age INT,
occupation STRING,
zipcode STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
'input.regex' = '(\\d+)::(\\w)::(\\d+)::(\\d+)::(\\S+)'
)
STORED AS TEXTFILE;

-- Create movies table
CREATE TABLE movies (
movieid BIGINT,
title STRING,
genres STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
'input.regex' = '(\\d+)::(.+)::(.+)'
)
STORED AS TEXTFILE;

-- Create ratings table
CREATE TABLE ratings (
userid BIGINT,
movieid BIGINT,
rating DOUBLE,
timestamped STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
'input.regex' = '(\\d+)::(\\d+)::(\\d+)::(\\d+)'
)
STORED AS TEXTFILE;

-- LOAD DATA LOCAL INPATH '/mnt/cgshare/ratings.dat' OVERWRITE INTO TABLE ratings;--

-- Load data into tables
LOAD DATA LOCAL INPATH '/mnt/cgshare/users.dat' INTO TABLE users;
LOAD DATA LOCAL INPATH '/mnt/cgshare/movies.dat' INTO TABLE movies;
LOAD DATA LOCAL INPATH '/mnt/cgshare/ratings.dat' INTO TABLE ratings;

-- Verify data loading
SELECT 'Users count: ' || COUNT(*) FROM users;
SELECT 'Movies count: ' || COUNT(*) FROM movies;
SELECT 'Ratings count: ' || COUNT(*) FROM ratings;

-- =============================================
-- 2. Top 10 Most Rated Movies
-- =============================================

SELECT
m.title AS movie_name,
COUNT(*) AS rating_count
FROM
ratings r
JOIN
movies m ON r.movieid = m.movieid
GROUP BY
m.title
ORDER BY
rating_count DESC
LIMIT 10;

-- =============================================
-- 3. Top 10 Movies by Gender
-- =============================================

-- For males
SELECT
'M' AS gender,
m.title AS movie_name,
AVG(r.rating) AS avg_rating
FROM
ratings r
JOIN
movies m ON r.movieid = m.movieid
JOIN
users u ON r.userid = u.userid
WHERE
u.gender = 'M'
GROUP BY
m.title
ORDER BY
avg_rating DESC
LIMIT 10;

-- For females
SELECT
'F' AS gender,
m.title AS movie_name,
AVG(r.rating) AS avg_rating
FROM
ratings r
JOIN
movies m ON r.movieid = m.movieid
JOIN
users u ON r.userid = u.userid
WHERE
u.gender = 'F'
GROUP BY
m.title
ORDER BY
avg_rating DESC
LIMIT 10;

-- =============================================
-- 4. Average Rating for Movie ID 2116 by Age Group
-- =============================================

SELECT
u.age AS age_group,
AVG(r.rating) AS avg_rating
FROM
ratings r
JOIN
users u ON r.userid = u.userid
WHERE
r.movieid = 2116
GROUP BY
u.age
ORDER BY
u.age;

-- =============================================
-- 5. Top 10 Highest Rated Movies by Most Active Female User
-- =============================================

WITH top_female_reviewer AS (
    SELECT
        u.userid,
        COUNT(r.rating) AS review_count
    FROM
        users u
    JOIN
        ratings r ON u.userid = r.userid
    WHERE
        u.gender = 'F'
    GROUP BY
        u.userid
    ORDER BY
        review_count DESC
    LIMIT 1
),
top_10_movies_by_her AS (
    SELECT
        r.movieid,
        r.rating,
        r.timestamped 
    FROM
        ratings r
    JOIN
        top_female_reviewer tfr ON r.userid = tfr.userid
    ORDER BY
        r.rating DESC, r.timestamped DESC
    LIMIT 10
)
SELECT
    tfr.userid AS reviewer_id,
    m.title,
    AVG(r.rating) AS overall_avg_rating
FROM
    ratings r
JOIN
    movies m ON r.movieid = m.movieid
JOIN
    top_10_movies_by_her t10 ON r.movieid = t10.movieid
CROSS JOIN 
    top_female_reviewer tfr
GROUP BY
    tfr.userid, m.title
ORDER BY
    overall_avg_rating DESC;

-- =============================================
-- 6. Top 10 Best Movies from Year with Most High-Rated Movies (Rating >= 4.0)
-- =============================================

WITH movie_with_year AS (
    SELECT
        movieid,
        title,
        regexp_extract(title, '.*\\((\\d{4})\\).*', 1) AS year
    FROM
        movies
),
year_with_good_ratings_count AS (
    SELECT
        mwy.year,
        COUNT(1) as good_rating_count
    FROM
        ratings r
    JOIN
        movie_with_year mwy ON r.movieid = mwy.movieid
    WHERE
        r.rating >= 4.0 AND mwy.year IS NOT NULL AND mwy.year != ''
    GROUP BY
        mwy.year
),
best_year AS (
    SELECT
        year,
        good_rating_count
    FROM
        year_with_good_ratings_count
    ORDER BY
        good_rating_count DESC
    LIMIT 1
)
SELECT
    mwy.title,
    AVG(r.rating) AS avg_rating
FROM
    ratings r
JOIN
    movie_with_year mwy ON r.movieid = mwy.movieid
JOIN
    best_year byear ON mwy.year = byear.year
GROUP BY
    mwy.title
ORDER BY
    avg_rating DESC
LIMIT 10;

-- =============================================
-- 7. Top 10 Comedy Movies from 1997
-- =============================================

SELECT
m.title AS movie_name,
AVG(r.rating) AS avg_rating
FROM
ratings r
JOIN
movies m ON r.movieid = m.movieid
WHERE
m.title LIKE '%(1997)%'
AND m.genres LIKE '%Comedy%'
GROUP BY
m.title
ORDER BY
avg_rating DESC
LIMIT 10;

-- =============================================
-- 8. Top 5 Movies for Each Genre
-- =============================================

WITH exploded_genres AS (
    SELECT
        m.movieid,
        m.title,
        single_genre
    FROM
        movies m
    LATERAL VIEW explode(split(m.genres, '\\|')) t AS single_genre
),
movie_avg_ratings AS (
    SELECT
        eg.single_genre,
        eg.title,
        AVG(r.rating) as avg_rating
    FROM
        ratings r
    JOIN
        exploded_genres eg ON r.movieid = eg.movieid
    GROUP BY
        eg.single_genre, eg.title
)
SELECT
    single_genre,
    title,
    avg_rating
FROM (
    SELECT
        single_genre,
        title,
        avg_rating,
        ROW_NUMBER() OVER(PARTITION BY single_genre ORDER BY avg_rating DESC) as rn
    FROM
        movie_avg_ratings
) ranked_movies
WHERE
    rn <= 5;

-- =============================================
-- 9. Highest Rated Genre by Year
-- =============================================


WITH movie_with_year_and_genre AS (
    SELECT
        regexp_extract(m.title, '.*\\((\\d{4})\\).*', 1) AS year,
        single_genre,
        m.movieid
    FROM
        movies m
    LATERAL VIEW explode(split(m.genres, '\\|')) t AS single_genre
    WHERE
        regexp_extract(m.title, '.*\\((\\d{4})\\).*', 1) != ''
),
yearly_genre_ratings AS (
    SELECT
        myg.year,
        myg.single_genre,
        AVG(r.rating) as avg_rating
    FROM
        ratings r
    JOIN
        movie_with_year_and_genre myg ON r.movieid = myg.movieid
    GROUP BY
        myg.year, myg.single_genre
)
SELECT
    year,
    single_genre,
    avg_rating
FROM (
    SELECT
        year,
        single_genre,
        avg_rating,
        ROW_NUMBER() OVER(PARTITION BY year ORDER BY avg_rating DESC) as rn
    FROM
        yearly_genre_ratings
) ranked_genres
WHERE
    rn = 1
ORDER BY
    year DESC;

-- =============================================
-- 10. Highest Rated Movie by Region (Zipcode) - Save to HDFS
-- =============================================

SET hive.cli.print.header=true;

WITH region_movie_ratings AS (
    SELECT
        u.zipcode,
        m.title,
        AVG(r.rating) AS avg_rating
    FROM
        ratings r
    JOIN
        users u ON r.userid = u.userid
    JOIN
        movies m ON r.movieid = m.movieid
    WHERE u.zipcode IS NOT NULL AND u.zipcode != ''
    GROUP BY
        u.zipcode, m.title
),
ranked_region_movies AS (
    SELECT
        zipcode,
        title,
        avg_rating,
        ROW_NUMBER() OVER(PARTITION BY zipcode ORDER BY avg_rating DESC) as rn
    FROM
        region_movie_ratings
)
INSERT OVERWRITE DIRECTORY '/user/hive/movielens_output/top_movie_by_region'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
SELECT
    zipcode,
    title,
    avg_rating
FROM
    ranked_region_movies
WHERE
    rn = 1;

-- =============================================
-- 11. 分区表设计与数据加载 
-- =============================================

SET hive.exec.dynamic.partition=true;
SET hive.exec.dynamic.partition.mode=nonstrict;

CREATE TABLE movies_partitioned (
    MovieID BIGINT,
    Title STRING,
    Genres STRING
)
PARTITIONED BY (release_year INT)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '::'
STORED AS TEXTFILE;INSERT OVERWRITE TABLE movies_partitioned PARTITION(release_year)
SELECT
    MovieID,
    Title,
    Genres,
    CAST(regexp_extract(title, '.*\\((\\d{4})\\).*', 1) AS INT) as release_year
FROM
    movies
WHERE
    regexp_extract(title, '.*\\((\\d{4})\\).*', 1) != '';

-- =============================================
-- 12. 桶表创建与数据存储 
-- =============================================

SET hive.enforce.bucketing = true;


CREATE TABLE ratings_bucketed (
    UserID BIGINT,
    MovieID BIGINT,
    Rating DOUBLE,
    Timestamped STRING
)
CLUSTERED BY (UserID) SORTED BY (MovieID) INTO 8 BUCKETS
ROW FORMAT DELIMITED FIELDS TERMINATED BY '::'
STORED AS TEXTFILE;

INSERT OVERWRITE TABLE ratings_bucketed
SELECT
    UserID,
    MovieID,
    Rating,
    Timestamped
FROM
    ratings;

-- =============================================
-- 13. 抽样分析实践 
-- =============================================


SELECT
    AVG(Rating) AS avg_rating,
    MAX(Rating) AS max_rating,
    MIN(Rating) AS min_rating
FROM
    ratings_bucketed TABLESAMPLE (10 PERCENT);

SELECT
    SUM(CASE WHEN Rating >= 4.0 THEN 1 ELSE 0 END) / COUNT(1) AS high_rating_ratio
FROM
    ratings_bucketed TABLESAMPLE (BUCKET 3 OUT OF 8 ON UserID);

SELECT
    UserID,
    MovieID,
    Rating
FROM
    ratings_bucketed TABLESAMPLE (BUCKET 3 OUT OF 8 ON UserID)
ORDER BY
    Rating DESC
LIMIT 3;