Hive|Students表在Hive的多表查询

1.资料准备

资料下载:https://wwyl.lanzoum.com/isI3P0zrssnc

字段解释:

课程表:

CODE NAME PERIOD CREDIT
课程编号 课程名称 课时 学分

任课表:

CODE SEMESTER TEACHER_ID COURSE_CODE
课程编号 学期 任课老师编号 与上表CODE对应

专业表

ID COLLEGE MAJOR GRADE CLASS TYPE
编号 学院 学号 年级 班级 类型

学生表

ID NAME GENDER BIRTHDATE PHONENUMBER MAJOR_CLASS
学生编号 姓名 性别 生日 手机号 专业班级

选课表

COURSE_CLASS_CODE STUDENT_ID SCORE
与表2的CODE对应 学生编号 成绩

教师表

ID NAME GENDER TITLE
教师编号 姓名 性别 职称

2.日常操作

(1)启动、新建文件夹

1
2
start-all.sh
hadoop fs -mkdir /hadoop_class/student_data

(2)copyFromLocal

1
2
3
4
5
6
hadoop fs -copyFromLocal ~/student/tb_course.csv /hadoop_class/student_data
hadoop fs -copyFromLocal ~/student/tb_course_class.csv /hadoop_class/student_data
hadoop fs -copyFromLocal ~/student/tb_major_class.csv /hadoop_class/student_data
hadoop fs -copyFromLocal ~/student/tb_student.csv /hadoop_class/student_data
hadoop fs -copyFromLocal ~/student/tb_electives.csv /hadoop_class/student_data
hadoop fs -copyFromLocal ~/student/tb_teacher.csv /hadoop_class/student_data
1
2
hive
use username;/*我这次没用*/

(3)建表

1
2
3
4
5
6
/*course*/
CREATE EXTERNAL TABLE IF NOT EXISTS course(CODE string,NAME string,PERIOD int,CREDIT int)
row format serde 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
stored as textfile

load data inpath '/hadoop_class/student_data/tb_course.csv' overwrite into table course;
1
2
3
4
5
6
/*course_class*/
CREATE EXTERNAL TABLE IF NOT EXISTS course_class(CODE string,SEMESTER string,TEACHER_ID int,COURSE_CODE string)
row format serde 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
stored as textfile

load data inpath '/hadoop_class/student_data/tb_course_class.csv' overwrite into table course_class;
1
2
3
4
5
6
/*major_class*/
CREATE EXTERNAL TABLE IF NOT EXISTS major_class(ID int,COLLEGE string,MAJOR string,GRADE int,CLASS int,TYPE string)
row format serde 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
stored as textfile;

load data inpath '/hadoop_class/student_data/tb_major_class.csv' overwrite into table major_class;
1
2
3
4
5
6
/*students*/
CREATE EXTERNAL TABLE IF NOT EXISTS students(ID int,NAME string,GENDER string,BIRTHDATE string,PHONENUMBER string,MAJOR_CLASS int)
row format serde 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
stored as textfile;

load data inpath '/hadoop_class/student_data/tb_student.csv' overwrite into table students;
1
2
3
4
5
6
/*electives*/
CREATE EXTERNAL TABLE IF NOT EXISTS electives(COURSE_CLASS_CODE string,STUDENT_ID int,SCORE int)
row format serde 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
stored as textfile;

load data inpath '/hadoop_class/student_data/tb_electives.csv' overwrite into table electives;
1
2
3
4
5
6
/*teacher*/
CREATE EXTERNAL TABLE IF NOT EXISTS teacher(ID int,NAME string,GENDER string,TITLE string)
row format serde 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
stored as textfile;

load data inpath '/hadoop_class/student_data/tb_teacher.csv' overwrite into table teacher;

(4)题目[去表头]

1、分析数据:
a、男女生比例
b、学生整体及格率
c、每门课程的平均分,要求显示出课程的中文名字
d、有2门课不及格的学生(重修的学生)
e、课程的及格率,同时要显示老师的名字(绩效考核)

1
2
3
4
5
6
7
alter table course set TBLPROPERTIES("skip.header.line.count"="1");
alter table course_class set TBLPROPERTIES("skip.header.line.count"="1");
alter table major_class set TBLPROPERTIES("skip.header.line.count"="1");
alter table students set TBLPROPERTIES("skip.header.line.count"="1");
alter table electives set TBLPROPERTIES("skip.header.line.count"="1");
alter table teacher set TBLPROPERTIES("skip.header.line.count"="1");
/*因为第一行有表头 会影响数据*/

3.解题

a、男女生比例

方法一:

1
2
3
SELECT GENDER,count(*),100*count(*)/SUM(count(*)) over()
FROM students
GROUP BY GENDER;

方法二:

1
2
3
select t1.male/t2.female from
(select count(1) as male from students where GENDER='男') t1,
(select count(1) as female from students where GENDER='女') t2 where 1=1;

b、学生整体及格率

方法一:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
CREATE TEMPORARY TABLE temp1 AS
SELECT COURSE_CLASS_CODE, COUNT(*) AS totlsum
FROM electives
WHERE SCORE>=60
GROUP BY COURSE_CLASS_CODE;

CREATE TEMPORARY TABLE temp2 AS
SELECT COURSE_CLASS_CODE, COUNT(*) AS totlsum2
FROM electives
GROUP BY COURSE_CLASS_CODE;

SELECT temp1.COURSE_CLASS_CODE,CONCAT(CAST(temp1.totlsum*100/temp2.totlsum2 AS DECIMAL(10,2)),'%')
FROM temp1,temp2
WHERE temp1.COURSE_CLASS_CODE=temp2.COURSE_CLASS_CODE;
/*SELECT CONCAT(CAST(value AS DECIMAL(10,2)), '%') FROM table_name;
执行以上 HiveQL 查询后,返回的结果会将 value 列中的所有数值格式化为保留两位小数并添加百分号的形式*/

方法二:

1
2
3
4
5
6
7
8
9
SELECT t1.pass/t2.total 
FROM (SELECT COUNT(1) as pass
FROM electives
WHERE SCORE <60)t1,
(SELECT COUNT(1) as total
FROM electives)t2;

SELECT ROUND(SUM(IF(SCORE>=60,1,0))/COUNT(1),2) FROM electives;
/*ROUND函数 将数字四舍五入到指定的位数。*/

c、每门课程的平均分,要求显示出课程的中文名字

方法一:

1
2
3
4
5
6
7
8
9
CREATE TEMPORARY TABLE temp3 AS
SELECT COURSE_CLASS_CODE,SPLIT(COURSE_CLASS_CODE, '\\.')[0] AS code,avg(score) AS avg1
from electives
group by COURSE_CLASS_CODE;

SELECT t1.code,avg(avg1),t2.NAME
FROM temp3 t1,course t2
WHERE t1.code=t2.CODE
GROUP BY t1.code,t2.NAME;

方法二:

1
2
3
4
5
6
7
/*非严格模式*/
SELECT t2.NAME,ROUND(AVG(SCORE),2) AS avg_score
FROM electives t1,course t2,course_class t3
WHERE
t1.COURSE_CLASS_CODE = t3.CODE AND
t2.CODE = t3.COURSE_CODE
GROUP BY t2.NAME;

方法三:

1
2
3
4
SELECT t2.NAME,ROUND(AVG(SCORE),2)AS avg_score FROM electives t1
LEFT JOIN course_class t3 ON t1.COURSE_CLASS_CODE=t3.CODE
LEFT JOIN course t2 ON t2.CODE=t3.COURSE_CODE
GROUP BY t2.NAME;

方法四:

1
2
3
4
5
6
7
SELECT t1.COURSE_CLASS_CODE,t2.NAME,t1.avg_score
FROM
(SELECT COURSE_CLASS_CODE,ROUND(AVG(SCORE),2)AS avg_score FROM electives GROUP BY COURSE_CLASS_CODE)t1
LEFT JOIN course_class t3 ON t1.COURSE_CLASS_CODE=t3.CODE
LEFT JOIN course t2 ON t2.CODE=t3.COURSE_CODE;

/*FULL OUTER,LEFT ,RIGHT,CROSS JOIN 都可以*/

d、有2门课不及格的学生(重修的学生)

方法一:

1
2
3
4
5
6
7
8
9
CREATE TEMPORARY TABLE temp4 AS
SELECT STUDENT_ID,count(*) AS count1
FROM electives
WHERE SCORE<60
GROUP BY STUDENT_ID;

SELECT t1.NAME,t2.count1
FROM students t1,temp4 t2
WHERE t1.ID=t2.STUDENT_ID AND count1>=2;/*有2门课不及格的学生(重修的学生) 所以是>=2*/

方法二:

1
2
3
4
5
SELECT t1.STUDENT_ID,t2.NAME,t1.score
FROM
(SELECT STUDENT_ID,SUM(IF(SCORE<60,1,0))AS score FROM electives GROUP BY STUDENT_ID HAVING score>=2)t1
LEFT JOIN student t2 ON t2.ID=t1.STUDENT_ID;
/*姓名会显示失败,暂时没有解决*/

e、课程的及格率,同时要显示老师的名字(绩效考核)

方法一:

1
2
3
4
5
6
7
8
9
10
11
/*课程平均分绩效*/
CREATE TEMPORARY TABLE temp5 AS
SELECT t1.COURSE_CLASS_CODE,t2.TEACHER_ID AS ID1,avg(t1.score) AS avg1
FROM electives t1,course_class t2
WHERE t1.COURSE_CLASS_CODE=t2.CODE
GROUP BY t1.COURSE_CLASS_CODE,t2.TEACHER_ID
ORDER BY t2.TEACHER_ID;

SELECT t1.COURSE_CLASS_CODE,t1.ID1,t2.NAME,t1.avg1
FROM temp5 t1,teacher t2
WHERE t1.ID1=t2.ID;

方法二:

1
2
3
4
5
6
7
8
SELECT t1.COURSE_CLASS_CODE,t1.per,t3.NAME
FROM
(SELECT COURSE_CLASS_CODE,ROUND(SUM(IF(SCORE>=60,1,0))/COUNT(*),4) AS per
FROM electives
GROUP BY COURSE_CLASS_CODE)t1
LEFT JOIN course_class t2 ON t1.COURSE_CLASS_CODE=t2.CODE
LEFT JOIN teacher t3 on t2.teacher_id = ID
ORDER BY t3.NAME;

f、老师教了几门课

1
2
3
4
SELECT t1.NAME AS TEACHER_NAME, COUNT(t2.COURSE_CODE) 
FROM teacher t1
JOIN course_class t2 ON t1.ID = t2.TEACHER_ID
GROUP BY t1.NAME;

1
2
/*最后要记得*/
stop-all.sh