编程知识 cdmana.com

hive sql
create table sql
    create table student(s_id string,s_name string,s_birth string,s_sex string) row format delimited fields terminated by '\t';
    
    create table course(c_id string,c_name string,t_id string) row format delimited fields terminated by '\t';
    
    create table teacher(t_id string,t_name string) row format delimited fields terminated by '\t';
    
    create table score(s_id string,c_id string,s_score int) row format delimited fields terminated by '\t';

student.csv
    01	赵雷	1990-01-01	男
    02	钱电	1990-12-21	男
    03	孙风	1990-05-20	男
    04	李云	1990-08-06	男
    05	周梅	1991-12-01	女
    06	吴兰	1992-03-01	女
    07	郑竹	1989-07-01	女
    08	王菊	1990-01-20	女
    
course.csv
    01	语文	02
    02	数学	01
    03	英语	03
    
teacher.csv
    01	张三
    02	李四
    03	王五
  
score.csv 
    01	01	80
    01	02	90
    01	03	99
    
    02	01	70
    02	02	60
    02	03	80
    
    03	01	80
    03	02	80
    03	03	80
    
    04	01	50
    04	02	30
    04	03	20
    
    05	01	76
    05	02	87
    
    06	01	31
    06	03	34
    
    07	02	89
    07	03	98

load data into hive

    load data local inpath '/root/decent_sql_test/student.csv' into table student;
    
    load data local inpath '/root/decent_sql_test/course.csv' into table course;
    
    load data local inpath '/root/decent_sql_test/teacher.csv' into table teacher;
    
    load data local inpath '/root/decent_sql_test/score.csv' into table score;
 
– 1、查询"01"课程比"02"课程成绩高的学生的信息及课程分数:
 
    select student.*,
           s.s_score as 01_score,
           b.s_score as 02_score
    from student
             join score s
                  on student.s_id = s.s_id and s.c_id = 01
             left join score b on student.s_id = b.s_id and b.c_id = 02
    where s.s_score > b.s_score;
    
– 3、查询平均成绩大于等于60分的同学的学生编号和学生姓名和平均成绩:   
    
    select s.s_id,
           s.s_name,
           avg(s2.s_score) avg
    from student s
             left join score s2 on s.s_id = s2.s_id
    group by s.s_id, s.s_name
    having avg > 60;
    
– 4、查询平均成绩小于60分的同学的学生编号和学生姓名和平均成绩:
– (包括有成绩的和无成绩的)

    select s.s_id,  --平均成绩小于60的学生信息 
           s.s_name,
           round(avg(s2.s_score), 2) avg
    from student s
             left join score s2 on s.s_id = s2.s_id
    group by s.s_id, s.s_name
    having round(avg(s2.s_score), 2) < 60
    
    union all --拼接关联在一起
    
    select t.s_id, t.s_name, 0 as avgScore --无成绩的学生的信息
    from student t
    where s_id not in (select distinct sc.s_id from score sc);
    
– 5、查询所有同学的学生编号、学生姓名、选课总数、所有课程的总成绩:
    
    select student.s_id,
           student.s_name,
           count(distinct s.c_id), --聚合函数
           sum(s.s_score)          --聚合函数
    from student
             left join score s on student.s_id = s.s_id
    group by student.s_id,student.s_name --前面出现聚合函数,非聚合函数字段要进行分组
    
– 6、查询"李"姓老师的数量:

    --慎用count(distinct),count(distinct)容易产生倾斜问题。
    --尽量使用group by 代替 count(distinct)
    
    select count(distinct t.t_name) from teacher t where t.t_name like '李%';
    select count(t.t_name) from teacher t where t.t_name like '李%' group by t.t_name ;
    
– 7、查询学过"张三"老师授课的同学的信息:

    --思维误区,嵌套查询,
    select c.c_id from course c where c_id = (select t_id from teacher  where t_name='张三');
    --正常逻辑
    select  student.* from student
    join score s on student.s_id = s.s_id
    join course c on s.c_id = c.c_id
    join teacher t on c.t_id = t.t_id and t_name='张三';

– 8、查询没学过"张三"老师授课的同学的信息:
    select student.*
    from student
             left join (
        select s_id
        from score
                 join course c on score.c_id = c.c_id
                 join teacher t2 on c.t_id = t2.t_id and t2.t_name = '张三'
    ) t on t.s_id = student.s_id
    where t.s_id is null;
    ;
    
– 9、查询学过编号为"01"并且也学过编号为"02"的课程的同学的信息:
    select *
    from student
             join (select s_id from score where c_id = '01') t1
                  on student.s_id = t1.s_id --前面是一个整体,一个表 join 就是保留两边都有的数据
             join (select s_id from score where c_id = '02') t2
                  on student.s_id = t2.s_id;
                  
– 10、查询学过编号为"01"但是没有学过编号为"02"的课程的同学的信息:
    select student.* from student
    join (select s_id from score where c_id =1 )tmp1
        on student.s_id=tmp1.s_id
    left join (select s_id from score where c_id =2 )tmp2 --前面是一个整体,一个表此处用left join 保留左表符合条件的数据
        on student.s_id =tmp2.s_id 
    where tmp2.s_id is null;
    
– 11、查询没有学全所有课程的同学的信息:

    select s.*
    from student s
             join
             (select count(c_id) cn1 from course) t1
             left join
             (select s_id, count(c_id) cn2 from score s group by s.s_id) t on t.cn2 = t1.cn1 and s.s_id = t.s_id
    where t.s_id is null;
    
– 12、查询至少有一门课与学号为"01"的同学所学相同的同学的信息:
    select student.*
    from student
             join (select c_id from score where score.s_id = 01) temp
             join (select s_id, c_id from score) temp2
                  on temp.c_id = temp2.c_id and student.s_id = temp2.s_id
    where student.s_id not in ('01')
    group by student.s_id, s_name, s_birth, s_sex;
    
– 13、查询和"01"号的同学学习的课程完全相同的其他同学的信息:
    select student.*, tmp1.course_id
    from student
             join (select s_id, concat_ws('|', collect_set(c_id)) course_id
                   from score
                   group by s_id
                   having s_id not in (1)) tmp1
                  on student.s_id = tmp1.s_id
             join (select concat_ws('|', collect_set(c_id)) course_id2
                   from score
                   where s_id = 1) tmp2
                  on tmp1.course_id = tmp2.course_id2;

union all and union 区别
UNION去重且排序
  
UNION ALL不去重不排序
hiving的用法
  sql中having子句与where子句类似,都是表示条件的设定,它们的区别在于,
  having子句在查询过程中慢于聚合语句(sum,min,max,avg,count);
  而where子句在查询过程中则快于聚合语句(sum,min,max,avg,count)。
    
  如果你对何时应该使用WHERE,何时使用HAVING仍旧很迷惑,请遵照下面的说明:   

 WHERE语句在GROUP BY语句之前;SQL会在分组之前计算WHERE语句。   

 HAVING语句在GROUP BY语句之后;SQL会在分组之后计算HAVING语句。
hive join 用法
内关联(JOIN)
    只返回能关联上的结果。
    
左外关联(LEFT [OUTER] JOIN)
    以LEFT [OUTER] JOIN关键字前面的表作为主表,和其他表进行关联,返回记录和主表的记录数一致,关联不上的字段置为NULL。
    是否指定OUTER关键字,貌似对查询结果无影响。
    
右外关联(RIGHT [OUTER] JOIN)    
    和左外关联相反,以RIGTH [OUTER] JOIN关键词后面的表作为主表,和前面的表做关联,返回记录数和主表一致,关联不上的字段为NULL。
    是否指定OUTER关键字,貌似对查询结果无影响

全外关联(FULL [OUTER] JOIN)
    以两个表的记录为基准,返回两个表的记录去重之和,关联不上的字段为NULL。
    是否指定OUTER关键字,貌似对查询结果无影响。
    注意:FULL JOIN时候,Hive不会使用MapJoin来优化。
    
LEFT SEMI JOIN
    以LEFT SEMI JOIN关键字前面的表为主表,返回主表的KEY也在副表中的记录。


版权声明
本文为[wx5863de1532ac4]所创,转载请带上原文链接,感谢
https://blog.51cto.com/u_12469622/3257112

Scroll to Top