目录
hive入门到精通
hive部署
启动Hadoop
- # 启动hadoop
- start-all.sh
- # 检查hadoop进程
- jps
- # 检查各端口
- netstat -aplnt | grep java
复制代码 检查MySQL是否启动成功
- ps -aux | grep mysql
- netstat -aplnt | grep 3306
复制代码 安装hive
- # 将软件上传到 /opt/soft 目录
- # 解压hive
- tar -zxvf apache-hive-3.1.3-bin.tar.gz
- # 目录改名
- mv apache-hive-3.1.3-bin hive3
- # 进入配置文件目录
- cd /opt/soft/hive3/conf
- # 复制配置文件
- cp hive-env.sh.template hive-env.sh
- cp hive-default.xml.template hive-site.xml
- # 编辑环境配置文件
- vim hive-env.sh
- # 编辑配置文件
- vim hive-site.xml
复制代码 hive-env.sh
- # hadoop 安装路径
- export HADOOP_HOME=/opt/soft/hadoop3/
- # hive 配置文件路径
- export HIVE_CONF_DIR=/opt/soft/hive3/conf/
复制代码 hive-site.xml
注意:上面配置文件中的路径在 vi 编辑器下 全局替换- :%s@\${system:java.io.tmpdir}@/tmp/hive-logp@g
复制代码 不要使用图形化 不然每次保存后3215行都会有个  特殊字符 如果产生删除即可 具体报错信息 后面有单独的描述
上传 MySQL 连接驱动 jar 包到 hive 安装目录的lib目录下:
jar 包有两个 分别为:
- mysql-connector-java-8.0.33.jar
- protobuf-java-3.22.2.jar
删除原有的 protobuf-java-2.5.0.jar 文件
guava版本冲突
删除 hive/lib目录中的 guava-19.0.jar
拷贝hadoop/share/hadoop/common/lib目录中的 guava-27.0-jre.jar 到 hive/lib 目录- rm -f /opt/soft/hive3/lib/guava-19.0.jar
- cp -v /opt/soft/hadoop3/share/hadoop/common/lib/guava-27.0-jre.jar /opt/soft/hive3/lib
复制代码 配置环境变量
- export HIVE_HOME=/opt/soft/hive3
- export PATH=$PATH:$HIVE_HOME/bin
复制代码 初始化hive的元数据库
注意:初始初始元素中库之前 保证 hadoop 和 mysql 正常启动
- schematool -initSchema -dbType mysql
复制代码- Exception in thread "main" java.lang.RuntimeException: com.ctc.wstx.exc.WstxParsingException: Illegal character entity: expansion character (code 0x8
- at [row,col,system-id]: [3215,96,"file:/usr/local/hive/conf/hive-site.xml"]
- at org.apache.hadoop.conf.Configuration.loadResource(Configuration.java:3051)
- ...
- at org.apache.hadoop.util.RunJar.main(RunJar.java:236)
- Caused by: com.ctc.wstx.exc.WstxParsingException: Illegal character entity: expansion character (code 0x8
- at [row,col,system-id]: [3215,96,"file:/usr/local/hive/conf/hive-site.xml"]
- at com.ctc.wstx.sr.StreamScanner.constructWfcException(StreamScanner.java:621)
- ...
- at org.apache.hadoop.conf.Configuration.loadResource(Configuration.java:3034)
- ... 17 more
- 报错原因:
- hive-site.xml配置文件中,3215行(见报错记录第二行)有特殊字符
- 解决办法:
- 进入hive-site.xml文件,跳转到对应行,删除里面的  特殊字符即可
复制代码- Caused by: java.net.URISyntaxException: Relative path in absolute URI: ${system:java.io.tmpdir%7D/$%7Bsystem:user.name%7D
- at java.net.URI.checkPath(URI.java:1822)
- at java.net.URI.<init>(URI.java:745)
- at org.apache.hadoop.fs.Path.initialize(Path.java:260)
- 解决方案:将hive-site.xml配置文件的
- hive.querylog.location
- hive.exec.local.scratchdir
- hive.downloaded.resources.dir
- 三个值(原始为$标识的相对路径)写成绝对值
复制代码- # 全局替换
- :%s@\${system:java.io.tmpdir}@/tmp/hive-logp@g
复制代码 远程模式
- # 启动服务端
- hive --service metastore &
- hive --service hiveserver2 &
- # 后台运行
- nohup hive --service metastore > /dev/null 2>&1 &
- nohup hive --service hiveserver2 > /dev/null 2>&1 &
- hiveserver2 start
- nohup hiveserver2 >/dev/null 2>&1 &
复制代码- # 客户端连接
- hive
- beeline -u jdbc:hive2://spark01:10000 -n root
- beeline jdbc:hive2://spark01:10000> show databases;
复制代码 体验
- use default;
- create table person (
- id int,
- phonenum bigint,
- salary double,
- name string
- );
- create table ps (
- id int,
- phonenum bigint,
- salary double,
- name string
- );
- show tables;
- insert into person values (1001,13966668888,9999.99,"张三");
复制代码- songsong,bingbing_lili,xiao song:18_xiaoxiao song:19
- longlong,pingping_liuliu,xiao long:8_xiaoxiao long:9
复制代码- drop table person;
- create table person (
- name string,
- friends array<string>,
- childrens map<string,int>
- )
- row format delimited fields terminated by ','
- collection items terminated by '_'
- map keys terminated by ':'
- lines terminated by '\n';
复制代码- load data local inpath '/root/person.txt' into table person;
复制代码- drop table data;
- create table data (
- name string,
- amount int
- )
- row format delimited fields terminated by ','
- lines terminated by '\n';
- load data local inpath '/root/data.txt' into table data;
复制代码- select count(*) from data;
- select count(*) from data group by name;
- select name,max(t) from data group by name;
- select name,max(t) from data group by name order by max(t) ;
复制代码 编程
DDL
操作数据库
创建数据库
- -- 创建数据库不指定路径
- create database db_hive01;
- -- 创建数据库指定 hdfs 路径
- create database db_hive02 location '/db_hive02';
- -- 创建数据库附加 dbproperties
- create database db_hive03 with dbproperties ('create-date'='2023-04-17','create_author'='lihaozhe');
复制代码 查询数据库
- -- 查看所有数据库
- show databases;
- -- 模糊查看所有数据库
- -- * 代表所有
- -- | 代表或
- show databases like 'db_hive*';
- -- 查看数据库信息
- desc database db_hive03;
- -- 查看数据库详尽信息
- describe database db_hive03;
- -- 查看数据库更详尽信息
- describe database extended db_hive03;
复制代码 修改数据库
- -- 修改 dbproperties
- alter database db_hive03 SET dbproperties ('crate_data'='2023-04-18');
- -- 修改location
- alter database db_hive02 SET location '/db_hive002';
- -- 修改 owner user
- alter database database_name set owner user lhz;
复制代码 删除数据库
- -- 删除空数据库
- drop database db_hive02 restrict;
- -- 删除非空数据库
- drop database db_hive03 cascade;
复制代码 切换数据库
DML
操作数据表
普通表
临时表 temporary
外部表 external
- -- 利用 select 语句查询结果 创建一张表
- create table as select
- -- 复刻一张已经存在的表结构 但是 不包含数据
- create table like
复制代码 基本数据类型
数据类型说明定义tinyint1 byte 有符号整型smallint2 byte 有符号整型int4 byte 有符号整型bigint8 byte 有符号整型float4 byte 单精度浮点数double8 byte 双精度浮点数dicimal十进制精准数据类型varchar字符序列 需要指定最大长度 范围[1~65535]string字符串 无需指定最大长度timestamp时间binary二进制数据booleantrue falsearray一组相同数据类型的集合arraymap一组相同数据类型的键值对mapstruct由多个属性组成,每个属性都有自己的属性名和数据类型structid:int,name:string内部表
简单表- create table person (
- id int,
- phonenum bigint,
- salary dicimal,
- name string
- );
- show tables;
- insert into person values (1001,13966668888,9999.99,"张三");
复制代码 简单数据类型- create table data (
- name string,
- amount int
- )
- row format delimited fields terminated by ','
- lines terminated by '\n'
- location '/user/hive/warehouse/lihaozhe.db/data';
复制代码- # 上传文件到Hive表指定的路径
- hdfs dfs -put /root/data.csv /user/hive/warehouse/lihaozhe.db/data
复制代码 复杂数据类型- vim /root/person.txtsongsong,bingbing_lili,xiao song:18_xiaoxiao song:19
- longlong,pingping_liuliu,xiao long:8_xiaoxiao long:9
复制代码- drop table person;
- create table person (
- name string,
- friends array<string>,
- childrens map<string,int>
- )
- row format delimited fields terminated by ','
- collection items terminated by '_'
- map keys terminated by ':'
- lines terminated by '\n'; load data local inpath '/root/person.txt' into table person;
复制代码 json数据类型
json函数
get_json_object
json_tuple
json serde加载数据- --serialization 序列化
-
- --deserialization 反序列化
复制代码 - {"name":"user01","amount":"100"}
复制代码- {
- "name":"lhz",
- "friends":["lize","lanlan","manman"],
- "students":[
- "xiaohui":15000,"huixiaoxiao":18000
- ],
- "address":{
- "province":"jilin",
- "city":"liaoyuan",
- "district":"liaoyuan"
- }
- }
复制代码 -- 案例一- create table video (info string);
- load data local inpath '/root/video.log' into table video;
- select * from video limit 10;
- select count(*) from video;
复制代码- select
- get_json_object(info,'$.id') as id,
- get_json_object(info,'$.nickname') as nickname,
- get_json_object(info,'$.gold') as gold
- from video limit 5;
复制代码- select
- json_tuple(info,'id','nickname',"gold") as (id,nickname,gold)
- from video limit 5;
复制代码 案例二- create table video(
- id string ,
- uid string,
- nickname string,
- gold int,
- watchnumpv int,
- watchnumuv int,
- hots int,
- nofollower int,
- looktime int,
- smlook int ,
- follower int ,
- gifter int ,
- length int ,
- area string ,
- rating varchar(1),
- exp int ,
- type string
- )
- row format serde 'org.apache.hive.hcatalog.data.JsonSerDe';
复制代码- load data local inpath '/root/video.log' into table video;
复制代码- --把json数据,按类给格式化,
- STORED AS TEXTFILE;--文本文档的形式
复制代码- --导入数据
- load data local inpath '/root/video.log' into table video;
复制代码- 创建表的loaction'' 作用是 集群里边有对应的数据,则直接将数据加载到表中
- load data loacl inpath '' into table 是使用hive将数据从本地加载到已经建好的表中
- load data inpath '' into table 是使用hive将数据从集群里边加载到已经建好的表中
复制代码 外部表
- create external table data (
- name string,
- amount int
- )
- row format delimited fields terminated by ','
- lines terminated by '\n'
- location '/user/hive/warehouse/lihaozhe.db/data';
复制代码 部表与外部表转换
- -- 内部表转外部表
- alter table tblName set tblproperties('external'='true');
- -- 外部表转内部表
- alter table tblName set tblproperties('external'='false');
复制代码 查看表
- -- 查看表
- show tables;
- -- 查看某数据库下的某张表
- show tables in lihaozhe;
- -- 查看表
- show tables;
- -- 模糊查看数据表
- -- * 代表所有
- -- | 代表或
- show tables like 'per*';
- -- 查看基本表信息
- describe person;
- -- 查看基本表详细信息
- describe extended person;
- -- 查看基本表详细信息并格式化展示
- describe formatted person;
复制代码 修改表
- -- 修改表名称
- alter table person rename to tb_user;
- -- 添加字段 向末尾追加
- alter table tb_user add columns (gender tinyint);
- -- 修改字段名称及类型
- alter table tb_user change gender age smallint;
- -- 删除字段
复制代码 删除表
清除表
DQL
准备数据
-- 部门表 dept.csv
- 10,行政部,1700
- 20,财务部,1800
- 30,教学部,1900
- 40,销售部,1700
复制代码- hdfs dfs -mkdir -p /quiz01/dept
- hdfs dfs -put /root/dept.csv/quiz01/dept
复制代码- create external table dept(
- dept_id int comment '部门id',
- dept_name string comment '部门名称',
- location_code int comment '部门位置'
- )
- comment '部门表'
- row format delimited fields terminated by ','
- lines terminated by '\n'
- stored as textfile
- location '/quiz01/dept';
复制代码- load data local inpath '/root/dept.csv' into table dept;
复制代码员工表
- 7369,张三,研发,800.00,30
- 7499,李四,财务,1600.00,20
- 7521,王五,行政,1250.00,10
- 7566,赵六,销售,2975.00,40
- 7654,侯七,研发,1250.00.30
- 7698,马八,研发,2850.00,30
- 7782,金九,行政,2450.0,30
- 7788,银十,行政,3000.00,10
- 7839,小芳,销售,5000.00,40
- 7844,小明,销告,1500.00,40
- 7876,小李,行政,1100.00,10
- 7900,小元,讲师,950.00,30
- 7902,小海,行政,3000.00,10
- 7934,小红明,讲师,1300.00,30
复制代码- hdfs dfs -mkdir -p /quiz01/emp
- hdfs dfs -put /root/emp.csv /quiz01/emp
复制代码- create external table emp
- (
- emp_id int comment '员工ID',
- emp_name string comment '员工姓名',
- emp_job string comment '员工岗位',
- emp_salary decimal(8, 2) comment '员工薪资',
- dept_id int comment '员工隶属部门ID'
- )
- comment '员工表'
- row format delimited fields terminated by ','
- lines terminated by '\n'
- stored as textfile
- location '/quiz01/emp' ;
复制代码- load data local inpath '/root/emp.csv' into table emp;
复制代码 居民表 person.csv- hdfs dfs -mkdir -p /quiz02/person
- hdfs dfs -put /root/person.csv /quiz02/person
复制代码- CREATE external TABLE `person` (
- `id` int COMMENT '主键',
- `id_card` varchar(18) COMMENT '身份证号码',
- `mobile` varchar(11) COMMENT '中国手机号',
- `real_name` varchar(15) COMMENT '身份证姓名',
- `uuid` varchar(32) COMMENT '系统唯一身份标识符'
- )
- row format delimited fields terminated by ','
- lines terminated by '\n'
- stored as textfile
- location '/quiz02/person';
复制代码- load data local inpath '/root/person.csv' into table person;
复制代码 地区表 region.csv- hdfs dfs -mkdir -p /quiz02/region
- hdfs dfs -put /root/region.csv /quiz02/region
复制代码- CREATE external TABLE `region` (
- `parent_code` int COMMENT '当前地区的上一级地区代码',
- `region_code` int COMMENT '地区代码',
- `region_name` varchar(10) COMMENT '地区名称'
- )
- row format delimited fields terminated by ','
- lines terminated by '\n'
- stored as textfile
- location '/quiz02/region';
复制代码- load data local inpath '/root/region.csv' into table region;
复制代码 单表查询
- -- 查询所有
- select * from dept;
- -- 按照指定字段查询
- select dept_name from dept;
- -- 列别名
- select dept_name as name from dept;
- -- limit 分页查询
- select * from emp limit 5,5
- -- where 按条件查询
- select * from emp where dept_id = 10;
- -- 关系运算符
- -- = != > >= < <=
- -- in
- select * from emp where dept_id in (20,40);
- -- not in
- select * from emp where dept_id not in (20,40);
- -- like
- select * from emp where emp_name like '小%';
- -- not like
- select * from emp where emp_name not like '小%';
- -- 逻辑运算符
- -- and
- select * from emp where dept_id = 30 and emp_salary > 1000;
- -- between and
- select * from emp where dept_id = 30 and emp_salary > 1000 and emp_salary < 2000;
- select * from emp where dept_id = 30 and emp_salary between 1000 and 2000;
- --not between and
- select * from emp where dept_id = 30 and emp_salary not between 1000 and 2000;
- -- or
- select * from emp where dept_id = 10 or dept_id = 40;
- -- not !
- select * from emp where dept_id != 10;
- select * from emp where not dept_id = 10;
- -- 聚合函数
- -- count(*) count(1) count(column_name)
- select count(*) from emp;
- select count(*) as total from emp;
- -- max
- select max(emp_salary) from emp;
- -- min
- select min(emp_salary) from emp;
- -- sum
- select sum(emp_salary) from emp;
- -- avg
- select avg(emp_salary) from emp;
- -- group by 分组查询
- select dept_id, avg(emp_salary) as avg_salary from emp group by dept_id;
- -- having
- select dept_id, avg(emp_salary) as avg_salary from emp group by dept_id having avg_salary > 2000;
- -- where having
- select dept_id, avg(emp_salary) as avg_salary from emp where dept_id != 10 group by dept_id having avg_salary > 2000;
- -- order by 全局排序
- select * from emp order by dept_id desc ,emp_salary desc;
- select dept_id, max(emp_salary) from emp group by dept_id;
- select dept_id, max(emp_salary) as max_salary from emp group by dept_id order by max_salary desc;
- -- sort by (每个reduce)内部排序
- select * from emp sort by dept_id desc
- -- 查看 reduce 数量
- set mapreduce.job.reduces;
- -- 设置 reduce 数量 仅在当前连接有效 连接断开失效
- set mapreduce.job.reduces=2;
- select * from emp sort by dept_id desc;
- -- 将查询结果写入到文件
- insert overwrite local directory '/root/sort-result' select * from emp sort by dept_id desc;
- -- distribute by 分区 类似与 mapreduce 中的 partation 自定义分区
- set mapreduce.job.reduces=2;
- insert overwrite local directory '/root/distribute-result' select * from emp distribute by dept_id sort by emp_salary desc;
- -- distribute by 分区规则 根据字段的hash值 与 reduce 的数量 进行相除 余数 相同的在到一个分区
- -- hvie 要求 distribute by 语句执行 在 sort by 语句之前
- -- 执行结束之后 将 mapreduce.job.reduces 设置为 -1 不然 会影响 分区 分桶 load
- -- cluster by 只能升序 不能降序 cluster by = sort by + distribute by
- select * from emp cluster by dept_id;
复制代码 多表查询- -- 笛卡尔积
- select * from dept,emp;
- -- 避免笛卡尔积
- select * from dept,emp where dept.dept_id = emp.dept_id
- -- 等值json 内连接
- select * from dept join emp where dept.dept_id = emp.dept_id
- -- left join 左外连接
- select d.dept_id,d.dept_name,d.location_code,e.emp_id,e.emp_name,e.emp_jpb,e.emp_salary
- from dept d left join emp e where d.dept_id = e.dept_id;
- -- right join 右外连接
- select d.dept_id,d.dept_name,d.location_code,e.emp_id,e.emp_name,e.emp_jpb,e.emp_salary
- from dept d right join emp e where d.dept_id = e.dept_id;
- -- full join 满外连接
- select d.dept_id,d.dept_name,d.location_code,e.emp_id,e.emp_name,e.emp_jpb,e.emp_salary
- from dept d full join emp e where d.dept_id = e.dept_id;
- -- union 上下拼接 去重
- select * from emp where dept_id = 10 or dept_id = 40;
- select * from emp where dept_id in(10,40);
- select * from emp where dept_id = 10 union select * from emp where dept_id = 40;
- -- union all 上下拼接 不去重
- select * from emp where dept_id = 10 or dept_id = 40;
- select * from emp where dept_id in(10,40);
- select * from emp where dept_id = 10 union all select * from emp where dept_id = 40;
- -- 自关联
- select * from region where region_code='220422';
- --
复制代码 函数- # 设施本地模式
- set hive.exec.mode.local.auto=true;
- set mapperd.job.tracker=local
复制代码- -- 算术运算符
- -- + — * / % & | ~
- -- 数值运算
- -- round 四舍五入
- select round (3.3) as num;
- -- ceil 向上取整
- select ceil(3.3) as num;
- -- floor 向下取整
- select floor(3.3) as num;
- -- 字符串
- -- 截取 substr(column_name,start_index,length)
- select substr(id_card,3,3) from person;
- -- substring(column_name,start_index,length)
- select substring(id_card,3,3) from person;
- -- spilt 字符串切割
- select split('2023-04-19','-');
- -- nvl 判空 替换 null 值
- select nvl("lhz",1);
- select nvl(null,1);
- -- replace 字符串替换
- SELECT REPLACE('aaa.mysql.com','a','w');
- -- concat 字符串拼接
- select concat('slogan','-','tlbyxzcx');
- -- concat 字符串拼接
- select concat_ws('-',array('2022','04','19'));
- -- get_json_object 解析 json 字符串
- select get_json_object('[{"name":"lhz","age":41}]','$.name') as name;
- select get_json_object('[
- {"name":"lhz","age":41},
- {"name":"lz","age":14}
- ]','$.[0].name')
- -- json_tuple
- select json_tuple('{"name":"lhz","age":41}','name','age') as (name,age);
- -- 日期函数
- -- unix 时间戳
- --(1970.01.01 00:00:00 GMT UTC 时间)
- -- unix_timestamp 返回 bigint 无时区
- select unix_timestamp();-- 1681951622 时间秒数
- select unix_timestamp('1983-11-22 20:30:00','yyyy-MM-dd HH:mm:ss'); -- 438381000
- -- from_unixtime
- select from_unixtime(438381000); -- 1983-11-22 20:30:00
- -- current_date
- select current_date();
- -- current_timestamp
- select current_timestamp();
- -- year month day hours minute second
- select year('1983-01-23');
- -- datediff 两个日期相差天数(结束日期减去开始日期)
- select datediff('1983-01-23','1982-12-31')
- -- date_add 日期增加几天
- select date_add('1995-01-01',15);
- -- date_sub 日期减少几天
- select date_sub('1995-01-01',15);
- -- date_format 日期格式化
- select date_format ('1983-11-22 20:30:00','yyyy年MM月dd日 HH时mm分ss秒');
-
- -- 读取身份证获取出生日期 输出格式为 yyyy-MM-dd
- -- 1、字符串截取 2、日期格式化
- select substr(id_card,7,8) from person limit 3;
- select unix_timestamp(substr(id_card,7,8),'yyyyMMdd') from person limit 3;
- select from_unixtime(unix_timestamp(substr(id_card,7,8),'yyyyMMdd')) from person limit 3;
- select substr(from_unixtime(unix_timestamp(substr(id_card,7,8),'yyyyMMdd')),1,10) from person limit 3;
- -- 流程控制语句
- -- case when
- -- >90 a,80~90 b, 70~80 c, 60~70 的,<60 e
- select
- stu_id,course_id,
- case
- when score >= 90 then 'A'
- when score >= 80 then 'B'
- when score >= 70 then 'c'
- when score >= 60 then 'D'
- else'E'
- end as grade
- From score;
- -- if 三目运算 if(条件表达式,条件为真的返回结果,条件为假的返回结果)
- select if(1=2,'托尼','玛丽') as `发型师`
- -- 结合字符串函数 时间函数 流程控制函数 计算身份证信息
- -- 根据身份证号 判断性别 身份证号 第十七位 奇数为男性 偶数为女性
- select id_card, if(mod(substr(id_card,17,1),2) = 1,'精神小伙儿','扒蒜老妹儿') gender from person;
- -- 根据身份证号 找出所有男性信息
- select *, mod(substr(id_card,17,1),2) gender from person where mod(substr(id_card,17,1),2) = 1;
- -- 根据身份证号 计算男性人数和女性人数
- select
- if(mod(substr(id_card,17,1),2) = 1,'精神小伙儿','扒蒜老妹儿') gender ,
- count(*) gender_count
- from person group by mod(substr(id_card,17,1),2) limit 10;
- -- 根据身份证号 计算生日排序
- select
- date_format(from_unixtime( unix_timestamp(substr(id_card,7,8),'yyyyMMdd')),'yyyy-MM-dd') as `birthday`
- from person
- order by unix_timestamp(`birthday`,'yyyy-MM-dd') desc
- -- 根据身份证号 计算年龄
- -- 1、当前月份-出生月份 > 0 说明 已经过完生日 及 使用 当前年份 - 出生年份 = 年龄
- -- 2、当前月份-出生月份 < 0 说明 未过生日 及 使用 当前年份 - 出生年份 -1 = 年龄
- -- 3、当前月份-出生月份 = 0
- -- 3.1、当前日-出生日 > 0 说明 已经过完生日 及 使用 当前年份 - 出生年份 = 年龄
- -- 3.2、当前日-出生日 < 0 说明 未过生日 及 使用 当前年份 - 出生年份 -1 = 年龄
- -- 3.3、当前日-出生日 = 0 说明 生日视作过完了 及 使用 当前年份 - 出生年份 = 年龄
- select if(month(`current_date`()) - month(from_unixtime(unix_timestamp(substr(id_card, 7, 8), 'yyyyMMdd'))) > 0,
- year(`current_date`()) - year(from_unixtime(unix_timestamp(substr(id_card, 7, 8), 'yyyyMMdd'))),
- if(month(`current_date`()) - month(from_unixtime(unix_timestamp(substr(id_card, 7, 8), 'yyyyMMdd'))) < 0,
- year(`current_date`()) - year(from_unixtime(unix_timestamp(substr(id_card, 7, 8), 'yyyyMMdd'))) - 1,
- if(day(`current_date`()) - day(from_unixtime(unix_timestamp(substr(id_card, 7, 8), 'yyyyMMdd'))) > 0,
- year(`current_date`()) - year(from_unixtime(unix_timestamp(substr(id_card, 7, 8), 'yyyyMMdd'))),
- if(day(`current_date`()) - day(from_unixtime(unix_timestamp(substr(id_card, 7, 8), 'yyyyMMdd'))) < 0,
- year(`current_date`()) - year(from_unixtime(unix_timestamp(substr(id_card, 7, 8), 'yyyyMMdd'))) - 1,
- year(`current_date`()) - year(from_unixtime(unix_timestamp(substr(id_card, 7, 8), 'yyyyMMdd')))
- )
- )
- )
- ) as `age`
- from person;
-
- -- 集合函数
- -- size 集合中元素的数量
- select size(array(0,1,2,3,4,5));
- -- array 声明一个集合 数组
- select array(0,1,2,3,4,5) as nums;
- -- array_contains 判断array中是否包含某元素 返回值是 布尔值 tury false
- select array_contains(array(0,1,2,3,4,5),3) as num;
- -- sort_array 排序 目前只能升序
- select sort_array(array(0,1,2,3,4,5));
- -- struct 声明结构体属性名称
- select struct('real_name','lhz','age','41');
- -- {"col1":"real_name","col2":"lhz","col3":"age","col4":"41"}
- -- named_struct 声明结构体属性和值
- select named_struct('real_name','lhz','age','41');
- -- {"real_name":"lhz","age":"41"}
- -- 集合函数
- -- map
- select map('xz',1000,'js',800);
- -- {"xz":1000,"js":800}
- -- map_keys 返回 map 中所有的 key
- select map_keys(map('xz',1000,'js',800));
- -- map_values 返回 map 中所有的 value
- select map_values(map('xz',1000,'js',800));
- -- if 三目运算
- select if(条件表达式,条件为真表达式,条件为假表达式)
复制代码 练习
数据:
学生表
讲师表
课程表
分数表
学生表 student.csv- hdfs dfs -mkdir -p /quiz03/student
- hdfs dfs -put /root/student.csv /quiz03/student
复制代码- load data local inpath '/root/student.csv' into table student;
复制代码 课程表 course.csv- hdfs dfs -mkdir -p /quiz03/course
- hdfs dfs -put /root/course.csv /quiz03/course
复制代码- load data local inpath '/root/course.csv' into table course;
复制代码 分数表 score.csv- hdfs dfs -mkdir -p /quiz03/score
- hdfs dfs -put /root/score.csv /quiz03/score
复制代码- load data local inpath '/root/course.csv' into table course;
复制代码- -- 学生表
- create external table student (
- stu_id string comment '学生ID',
- stu_name string comment '学生姓名',
- birthday string comment '出生年月',
- gender string comment '学生性别'
- )
- row format delimited fields terminated by ','
- lines terminated by '\n'
- stored as textfile
- location '/quiz03/student';
- --教师表
- create external table teacher (
- tea_id string comment '课程ID',
- tea_name string comment '教师名称'
- )
- row format delimited fields terminated by ','
- lines terminated by '\n'
- stored as textfile
- location '/quiz03/teacher';
- --课程表
- create external table course (
- coures_id string comment '课程ID',
- coures_name string comment '课程名称',
- tea_id string comment '讲师ID'
- )
- row format delimited fields terminated by ','
- lines terminated by '\n'
- stored as textfile
- location '/quiz03/course';
- --成绩表
- create external table score (
- stu_id string comment '学生ID',
- coures_id string comment '课程ID',
- score string comment '成绩'
- )
- row format delimited fields terminated by ','
- lines terminated by '\n'
- stored as textfile
- location '/quiz03/score';
复制代码 综合练习
- -- 查询所有学生信息
- select * from student;
- -- 查询周姓学生信息
- select * from student where stu_name like '周%';
- -- 查询周姓学生数量
- select count(*) as zhou_count from student where stu_name like '周%';
- -- 查询 学生ID 004 的分数 超过 85 的成绩
- select * from score where stu_id = 004 and score > 85;
- -- 查询 学生ID 004 的分数 超过 85 的成绩
- select * from score where stu_id = 004 and score > 85;
- -- 查询 学生程ID 004 的成绩降序
- select * from score where stu_id = 01 order by score desc;
- -- 查询 数学成绩不及格学生及其对应的成绩 安装学生ID排序
- select stu.stu_id, stu.stu_name,s.score
- from student stu join course c join score s
- on stu.stu_id = s.stu_id and c.course_id = s.course_id
- and c.course_name = '数学' and s.score < 60
- order by stu.stu_id;
-
- -- 查询男女生人数
- select gender,count(*) as gender_count from student group by gender;
- -- 查询编号为 02 的课程平均成绩
- select avg(score) from score where course_id = 02;
- -- 查询每科课程平均成绩
- select avg(score) from score group by course_id;
- -- 查询参加考试学生人数
- select count(distinct stu_id) as stu_count from score;
- -- 查询每科有多少学生参加考试
- select course_id,count(*) as stu_count from score group by course_id;
- -- 查询未参加考试的学生信息
- select stu_id,stu_name from student where stu_id not in (
- select distinct stu.stu_id from student stu left join course c left join score s
- on stu.stu_id = s.stu_id and c.course_id = s.course_id
- order by stu.stu_id
- )
- -- 查询平均成绩及格(60分)的学生的平均成绩
- select stu_id, avg(score) avg_score
- from score
- group by stu_id
- having avg_score >= 60;
- -- 查询选修至少 4 门 以上课程学生的学号
- select stu_id,count(course_id) course_count from score
- group by stu_id
- having course_count >= 4;
- -- 查询姓氏相同学生名单 并且同姓人数大于 2 的姓氏
- select first_name ,count(*) first_name_count from (
- select stu_id,stu_name,substr(stu_name,1,1) as first_name
- from student
- ) ts
- group by ts.first_name
- having first_name_count > 1;
- -- 查询每门功课的学生的平均成绩 按照平均成绩升序 平均成绩相同按照课程编号降序
- select course_id, avg(score) avg_score
- from score
- group by course_id
- order by avg_score,course_id desc;
-
- -- 统计参加考试人数大于等于 15 的学科
- select course_id,count(*) as stu_count from score group by course_id having stu_count > 15;
- -- 查询学生总成绩并按照总成绩降序排序
- select stu_id, sum(score) sum_score
- from score
- group by stu_id
- order by sum_score desc;
-
- -- 按照指定格式显示 stu_id 语文 数学 英语 选课数 平均成绩
- select
- s.stu_id,
- sum(`if`(c.course_name='语文',score,0)) as `语文`,
- sum(`if`(c.course_name='数学',score,0)) as `数学`,
- sum(`if`(c.course_name='英语',score,0)) as `英语`,
- count(s.course_id) as `选课数`,
- avg(s.score) as `平均成绩`
- from course c left join score s
- on c.course_id = s.course_id
- group by s.stu_id
- order by `平均成绩` desc;
-
- -- 查询一共参加了三门功课且其中一门为语文的学生id 和 姓名
- select s.stu_id,stu_name from
- (select t1.stu_id ,count(t1.course_id) course_count from
- (select stu_id,course_id from score
- where stu_id in ( select stu_id from score where course_id = "01")
- ) t1 group by t1.stu_id having course_count >=3
- ) t2 join student s on t2.stu_id = s.stu_id;
- -- 分解
- -- 查询该学生的姓名
- select s.stu_id,stu_name from
- -- 成绩表中学习科目数量 >=3 科的学生
- (select t1.stu_id ,count(t1.course_id) course_count from
- -- 报名了语文的学生还报名了那些学科
- (select stu_id,course_id from score
- where stu_id in (
- -- 查询报名了语文的学生ID
- select stu_id from score where course_id = "01"
- )
- ) t1 group by t1.stu_id having course_count >=3
- ) t2 join student s on t2.stu_id = s.stu_id;
- -- 查询两门以上的课程不及格学生的学号及其平均成绩
- -- 1、先按照学生分组 过滤出成绩低于60的数量 大于1
- -- 2、计算所有学生的平均成绩
- -- 3、两个子查询相互join
- select t1.stu_id,t2.avg_score from
- (select stu_id, sum(if(score < 60, 1, 0)) as result from score group by stu_id having result > 1) t1
- left join
- (select stu_id,avg(score) as avg_score from score group by stu_id) t2 on t1.stu_id =t2.stu_id;
- -- 查询所有学生的学号、姓名、选课数、总成绩
- select
- stu.stu_id,stu.stu_name,count(s.course_id) count_course ,nvl(sum(s.score),0) total_score
- from student stu left join score s on stu.stu_id = s.stu_id
- group by stu.stu_id, stu.stu_name order by stu.stu_id;
- -- 平均成绩大于 85 的所有学生的学号、姓名、平均成绩
- select
- stu.stu_id,stu.stu_name ,nvl(avg(s.score),0) as `avg_score`
- from student stu left join score s on stu.stu_id = s.stu_id
- group by stu.stu_id, stu.stu_name having nvl(avg(s.score),0) > 85 order by stu.stu_id
- -- 查询学生的选课情况:学号,姓名,课程号,课程名称
- select student.stu_id,student.stu_name,c.course_id,c.course_name from student
- right join score s on student.stu_id = s.stu_id
- left join course c on s.course_id = c.course_id
- -- 查询学生的没有选课情况:学号,姓名
- select stu_id,stu_name from
- (
- select student.stu_id,student.stu_name, s.course_id from student
- left join score s on student.stu_id = s.stu_id
- left join course c on s.course_id = c.course_id
- ) t where course_id is null
- -- 查询出每门课程的及格人数和不及格人数
- select c.course_id,course_name,pass,fail
- from course c join
- (
- select
- course_id,sum(if(score >= 60,1,0)) as `pass`, sum(if(score < 60,1,0)) as `fail`
- from score group by course_id
- ) t on c.course_id = t.course_id
- -- 查询课程编号为03且课程成绩在80分以上的学生的学号和姓名及课程信息
- select t1.stu_id,s.stu_name,t1.course_id,c.course_name,t1.score from
- (select * from score where course_id = '03' and score > 80) t1
- left join student s on s.stu_id = t1.stu_id
- left join course c on t1.course_id = c.course_id
- -- 查询语文成绩低于平均分数的学生是谁,教师是谁
- select t3.stu_id,t3.stu_name,t3.`avg_score`,t.tea_name from
- (select t2.stu_id,t2.`avg_score`,s.stu_name,t2.course_id,c.tea_id from
- (select t1.stu_id,t1.course_id,t1.`avg_score` from
- (select stu_id,s.course_id, avg(score) as `avg_score` from score s right join
- (select course_id from course where course_name = '语文') t1 on t1.course_id = s.course_id
- group by stu_id,s.course_id) t1
- where t1.`avg_score` < (select avg(score) as `avg_score` from score s right join (select course_id from course where course_name = '语文') t1 on t1.course_id = s.course_id)
- ) t2 left join student s on t2.stu_id = s.stu_id
- left join course c on t2.course_id = c.course_id
- )t3 left join teacher t on t3.tea_id = t.tea_id;
- -- 查询所有学生总成绩和平均成绩,
- -- 且他们的总成绩低于平均成绩的有多少个人,
- -- 高于平均成绩的有多少人,
- -- 低于平均成绩的男生和女生分别有多少人,
- -- 且他们的任课老师是谁。
- -- 统计各科成绩各分数段人数:课程编号,课程名称,[100-85],[85-70],[70-60],[0-60]及所占百分比
- -- 方法一
- select course_id,
- concat(round((sum(`if`(score >= 85, 1, 0)) / count(*)) * 100, 2), '%') as `a`,
- concat(round((sum(`if`(score < 85, `if`(score >= 70, 1, 0), 0)) / count(*)) * 100, 2), '%') as `b`,
- concat(round((sum(`if`(score < 70, `if`(score >= 60, 1, 0), 0)) / count(*)) * 100, 2), '%') as `c`,
- concat(round((sum(`if`(score < 60, 1, 0)) / count(*)) * 100, 2), '%') as `d`
- from score group by course_id;
- -- 方法二
- select course_id,
- concat(round((sum(`if`(score >= 85, 1, 0)) / count(*)) * 100, 2), '%') as `a`,
- concat(round((sum(`if`(score between 70 and 84, 1, 0)) / count(*)) * 100, 2), '%') as `b`,
- concat(round((sum(`if`(score between 60 and 74, 1, 0)) / count(*)) * 100, 2), '%') as `c`,
- concat(round((sum(`if`(score < 60, 1, 0)) / count(*)) * 100, 2), '%') as `d`
- from score group by course_id;
- -- 查询各科成绩最高分、最低分和平均分,以如下形式显示:
- -- 课程ID,课程name,最高分,最低分,平均分,及格率,中等率,优良率,优秀率
- -- 及格为>=60,中等为:70-80,优良为:80-90,优秀为:>=90
- select c.course_id as `课程ID`,
- c.course_name as `课程name`,
- max(score) as `最高分`,
- min(score) as `最低分`,
- round(avg(score), 2) as `平均分`,
- concat(round(sum(`if`(score >= 60, 1, 0)) / count(*) * 100, 2), '%') as `及格率`,
- concat(round(sum(if(score between 70 and 79, 1, 0)) / count(*) * 100, 2), '%') as `中等率`,
- concat(round(sum(if(score between 80 and 89, 1, 0)) / count(*) * 100, 2), '%') as `优良率`,
- concat(round(sum(`if`(score >= 90, 1, 0)) / count(*) * 100, 2), '%') as `优秀率`
- from course c left join score s on c.course_id = s.course_id
- group by c.course_id, c.course_name;
- -- 查询每门课程的教师学生有谁,男生和女生的比例是多少,
- select t1.course_id,t1.gender,concat(round((t1.count_gender / t2.count_course_student) * 100,2),'%') as `proportion` from
- (
- select c.course_id, stu.gender,count(stu.gender) as `count_gender`
- from course c left join score s on c.course_id = s.course_id left join student stu on s.stu_id = stu.stu_id
- group by c.course_id, stu.gender
- ) t1
- join
- (
- select c.course_id, count(*) as `count_course_student`
- from course c left join score s on c.course_id = s.course_id left join student stu on s.stu_id = stu.stu_id
- group by c.course_id
- ) t2 on t1.course_id = t2.course_id
- join score s on t1.course_id = s.course_id
- -- 且他们的每门学科的成绩是男生比较优一些还是女生比较优一些,并且每门课程的最高分是谁。
- select s.course_id,max(s.score) as `max_score`,min(s.score) as `min_score` from course join score s on course.course_id = s.course_id group by s.course_id
- -- 课程编号为"01"且课程分数小于60,按分数降序排列的学生信息
- select s.stu_id, stu.stu_name, stu.birthday, stu.gender,s.score
- from score s join student stu on s.stu_id = stu.stu_id
- where s.score < 60 order by s.score desc
- -- 查询所有课程成绩在70分以上的学生的姓名、课程名称和分数,按分数升序
- select stu.stu_name, c.course_name, s2.score
- from student stu join
- (select s.stu_id, sum(`if`(s.score >= 70, 0, 1)) as `is_ok` from score s group by s.stu_id having is_ok = 0) t1
- on stu.stu_id = t1.stu_id left join score s2 on stu.stu_id = s2.stu_id left join course c on s2.course_id = c.course_id
- order by s2.score
- -- 查询某学生不同课程的成绩相同的学生编号、课程编号、学生成绩
- select s1.stu_id,collect_list(s1.course_id) as course_id,collect_set(s1.score) as score
- from score s1 join score s2 on s1.stu_id = s2.stu_id
- and s1.course_id != s2.course_id
- and s1.score == s2.score
- group by s1.stu_id
复制代码- -- 查询语文成绩低于平均分数的学生是谁,教师是谁
- select stu.stu_name,tea_name from student stu left join score s left join course c left join teacher t where c.course_nam = "语文" and s.
复制代码- -- 结合字符串函数 时间函数 流程控制函数 计算身份证信息
- -- 根据身份证号 判断性别 身份证号 第十七位 奇数为男性 偶数为女性
- select id_card, if(mod(substr(id_card,17,1),2) = 1,'精神小伙儿','扒蒜老妹儿') gender from person;
- -- 根据身份证号 找出所有男性信息
- select *, mod(substr(id_card,17,1),2) gender from person where mod(substr(id_card,17,1),2) = 1;
- -- 根据身份证号 计算男性人数和女性人数
- select
- if(mod(substr(id_card,17,1),2) = 1,'精神小伙儿','扒蒜老妹儿') gender ,
- count(*) gender_count
- from person group by mod(substr(id_card,17,1),2) limit 10;
- -- 根据身份证号 计算生日排序
- select
- date_format(from_unixtime( unix_timestamp(substr(id_card,7,8),'yyyyMMdd')),'yyyy-MM-dd') as `birthday`
- from person
- order by unix_timestamp(`birthday`,'yyyy-MM-dd') desc
- -- 根据身份证号 计算年龄
- -- 1、当前月份-出生月份 > 0 说明 已经过完生日 及 使用 当前年份 - 出生年份 = 年龄
- -- 2、当前月份-出生月份 < 0 说明 未过生日 及 使用 当前年份 - 出生年份 -1 = 年龄
- -- 3、当前月份-出生月份 = 0
- -- 3.1、当前日-出生日 > 0 说明 已经过完生日 及 使用 当前年份 - 出生年份 = 年龄
- -- 3.2、当前日-出生日 < 0 说明 未过生日 及 使用 当前年份 - 出生年份 -1 = 年龄
- -- 3.3、当前日-出生日 = 0 说明 生日视作过完了 及 使用 当前年份 - 出生年份 = 年龄
- select if(month(`current_date`()) - month(from_unixtime(unix_timestamp(substr(id_card, 7, 8), 'yyyyMMdd'))) > 0,
- year(`current_date`()) - year(from_unixtime(unix_timestamp(substr(id_card, 7, 8), 'yyyyMMdd'))),
- if(month(`current_date`()) - month(from_unixtime(unix_timestamp(substr(id_card, 7, 8), 'yyyyMMdd'))) < 0,
- year(`current_date`()) - year(from_unixtime(unix_timestamp(substr(id_card, 7, 8), 'yyyyMMdd'))) - 1,
- if(day(`current_date`()) - day(from_unixtime(unix_timestamp(substr(id_card, 7, 8), 'yyyyMMdd'))) > 0,
- year(`current_date`()) - year(from_unixtime(unix_timestamp(substr(id_card, 7, 8), 'yyyyMMdd'))),
- if(day(`current_date`()) - day(from_unixtime(unix_timestamp(substr(id_card, 7, 8), 'yyyyMMdd'))) < 0,
- year(`current_date`()) - year(from_unixtime(unix_timestamp(substr(id_card, 7, 8), 'yyyyMMdd'))) - 1,
- year(`current_date`()) - year(from_unixtime(unix_timestamp(substr(id_card, 7, 8), 'yyyyMMdd')))
- )
- )
- )
- ) as `age`
- from person;
复制代码 高级聚合函数
分组排序取TopN
- -- row_number() over () 连续序号
- -- over()里头的分组以及排序的执行晚于 where 、group by、order by 的执行。
- -- partition by 分区
- select *,row_number() over () as `num` from score;
- -- 查询各科成绩前三名的学生
- SELECT a.stu_id,a.course_id,a.score
- FROM score a
- LEFT JOIN score b ON a.course_id = b.course_id
- AND a.score <= b.score
- GROUP BY a.stu_id,a.course_id,a.score
- HAVING COUNT( b.stu_id ) <= 3
- ORDER BY a.course_id,a.score DESC;
- SELECT S1.course_id,s1.stu_id,s1.score FROM score s1
- WHERE (
- SELECT COUNT(*) FROM score s2
- WHERE s2.course_id=s1.course_id AND s2.score > s1.score
- ) < 3 ORDER BY s1.course_id,s1.score DESC;
- select * from
- (
- select course_id,stu_id,score,
- row_number() over (partition by course_id order by score desc ) as `num`
- from score
- ) t where t.num <= 3;
- -- rank() over () 排名 跳跃排序 序号不是连续的
- select * from
- (
- select course_id,stu_id,score,
- rank() over (partition by course_id order by score desc ) as `ranking`
- from score
- ) t;
- -- dense_rank() over () 排名 连续排序
- select * from
- (
- select course_id,stu_id,score,
- dense_rank() over (partition by course_id order by score desc ) as `ranking`
- from score
- ) t;
复制代码 分区表
模拟数据
身份证前六位
身份证前六位
- -- 行转列
- -- collect_list 行转列 有序可重复 结果是个集合
- select collect_list(emp_job) as `job` from employee;
- -- collect_set 行转列 过滤重复 结果是个集合
- select collect_list(emp_job) as `job` from employee;
- -- concat_ws 把集合转字符串
- concat_ws('分隔符',集合)
- select concat_ws(',',collect_set(emp_job)) as `job` from emp;
- -- split 把字符串转为集合
- concat_ws(字符串,'分隔符')
- select split(concat_ws(',',collect_set(emp_job)))as `job` from emp;
复制代码 pom.xml
- hdfs dfs -mkdir -p /quiz04/employee
- hdfs dfs -put /root/employee.csv /quiz04/employee
复制代码 工具类
- create external table employee(
- name string comment '姓名',
- sex string comment '性别',
- birthday string comment '出生年月',
- hiredate string comment '入职日期',
- job string comment '岗位',
- salary int comment '薪资',
- bonus int comment '奖金',
- friends array<string> comment '朋友',
- children map<string,int> comment '孩子'
- )
- row format delimited fields terminated by ','
- collection items terminated by '_'
- map keys terminated by ':'
- lines terminated by '\n'
- stored as textfile
- location '/quiz04/employee';
复制代码 创建数据库
- load data local inpath '/root/employee.csv' into table employee;
复制代码- -- explode
- select explode(array('java','python','scala','go')) as course;
- select explode(map('name','李昊哲','gender','1')) as (key,value);
- -- posexplode
- select posexplode(array('java','python','scala','go')) as (pos,course);
- -- inline
- select inline(array(named_struct('id',1,'name','李昊哲','gender','1'),
- named_struct('id',2,'name','李哲','gender','0'),
- named_struct('id',3,'name','李大宝','gender','1'))) as (id,name,gender);
- -- lateral view
- select * from employee lateral view explode(friends) t as friend;
- select e.name,e.friends,t1.friend from employee e lateral view explode(friends) t1 as `friend`;
- select * from employee e lateral view explode(children) t1 as `children_name`,`children_friend_count`;
- select e.name,e.children,t1.children_name,t1.nvl(t2.children_friend_count,0) from employee e
- lateral view explode(children) t1 as `children_name`,`children_friend_count`;
- select e.name,e.friends,e.children,t1.friend,t2.children_name,nvl(t2.children_friend_count,0) from employee e
- lateral view explode(friends) t1 as `friend`
- lateral view explode(children) t2 as `children_name`,`children_friend_count`;
- -- lateral view outer
复制代码 内部分区表
内部分区表
- hdfs dfs -mkdir -p /quiz04/movie
- hdfs dfs -put /root/movie.txt /quiz04/movie
复制代码 导入数据
- create external table movie(
- name string comment '电影名称',
- category string comment '电影分类'
- )
- row format delimited fields terminated by '-'
- lines terminated by '\n'
- stored as textfile
- location '/quiz04/movie';
复制代码 外部分区表
创建外部分区表关联目录
- load data local inpath '/root/employee.csv' into table employee;/partition_2
复制代码 创建外部分区表
- -- 根据上述电影信息表,统计各分类的电影数量
- select cate,count(name) as `quantity` from movie
- lateral view explode(split(category,',')) tmp as cate
- group by cate;
复制代码 导入数据
- -- 统计岗位数量
- select count(distinct emp_job) from emp;
- select count(*) from (select emp_job from emp group by emp_job) t;
复制代码 多重内部分区表
创建内部多重内部分区表
- Function(arg1,..., argn) OVER ([PARTITION BY <...>] [ORDER BY <....>] [<window_expression>])
- -- 其中Function(arg1,..., argn) 可以是下面分类中的任意一个
- -- 聚合函数:比如sum max min avg count等
- -- 排序函数:比如row_number rank dense_rank等
- -- 分析函数:比如lead lag first_value last_value等
- -- OVER [PARTITION BY <...>] 类似于group by 用于指定分组 每个分组你可以把它叫做窗口
- -- 如果没有PARTITION BY 那么整张表的所有行就是一组
- -- [ORDER BY <....>] 用于指定每个分组内的数据排序规则 支持ASC、DESC
- -- [<window_expression>] 用于指定每个窗口中 操作的数据范围 默认是窗口中所有行
复制代码 导入数据
- hdfs dfs -mkdir /quiz04/order
- hdfs dfs -put /root/order.csv /quiz04/order
复制代码 多重外部分区表
创建多重外部分区表关联目录
- load data local inpath '/root/employee.csv' into table employee;/partition_4
复制代码 创建多重外部分区表
- load data local inpath '/root/order.csv' into table order;
复制代码- -- 统计每个用户截至每次下单的累计下单总额
- select *,
- sum(order_amount) over
- (partition by user_id order by order_date rows between unbounded preceding and current row) `sum_order_amount`
- from `order`
- -- 统计每个用户截至每次下单的当月累积下单总额
- select *,
- sum(order_amount)
- over(partition by user_id,substr(order_date,1,7) order by order_date
- rows between unbounded preceding and current row) `sum_order_amount`
- from `order`
复制代码 动态分区
- -- 统计每个用户每次下单距离上次下单相隔的天数(首次下单按0天算)
- select user_id,user_name,order_id,order_date,datediff(order_date,last_order_date) `diff_date` from
- (select *,
- lag(order_date,1,order_date) over (partition by user_id order by order_date) `last_order_date` from order`) t
复制代码- -- 查询所有下单记录以及每个用户的每个下单记录所在月份的首/末次下单日期
- select *,
- first_value(order_date) over(partition by user_id,substr(order_date,1,7) order by order_date) `first_date`,
- last_value(order_date) over (partition by user_id,substr(order_date,1,7) order by order_date
- rows between unbounded preceding and unbounded following) `last_date`
- from `order`
复制代码- -- 为每个用户的所有下单记录按照订单金额进行排名
复制代码- hdfs dfs -mkdir -p /tmall/user
- hdfs dfs -put /root/user.csv /tmall/user
复制代码- create external table `user` (
- `user_id` string COMMENT '用户id',
- `gender` string COMMENT '性别',
- `birthday` string COMMENT '生日'
- ) COMMENT '用户信息表'
- row format delimited fields terminated by ','
- lines terminated by '\n'
- stored as textfile
- location '/tmall/user';
复制代码- load data local inpath '/root/user.csv' into table user;
复制代码 分桶
数据抽样 提高join查询效率
- 创建普通表并导入数据
- 开启分桶
- 查询普通表将,将查询结果插入桶
- 从桶中查询数据
创建普通表并导入数据
- hdfs dfs -mkdir -p /tmall/sku
- hdfs dfs -put /root/sku.csv /tmall/sku
复制代码- create external table sku (
- `sku_id` string COMMENT '商品id',
- `name` string COMMENT '商品名称',
- `category_id` string COMMENT '所属分类id',
- `from_date` string COMMENT '上架日期',
- `price` double COMMENT '商品单价'
- ) COMMENT '商品信息表'
- row format delimited fields terminated by ','
- lines terminated by '\n'
- stored as textfile
- location '/tmall/sku';
复制代码 开启分桶
- load data local inpath '/root/sku.csv' into table sku;
复制代码 创建桶表
- hdfs dfs -mkdir -p /tmall/category
- hdfs dfs -put /root/category.csv /tmall/category
复制代码 载入数据到桶表
- load data local inpath '/root/sku.csv' into table sku;insert into table bucket_tb select id from bucket_source where id is not null;
复制代码- load data local inpath '/root/category.csv' into table category;
复制代码 视图
- hdfs dfs -mkdir -p /tmall/order
- hdfs dfs -put /root/order.csv /tmall/order
复制代码 存储与压缩
文件格式
行式存储与列式存储
hive表中的数据选择一个合适的文件格式,对于高性能查询是比较有益的
行式存储:text file,sequence file
列式存储:ORC、Parquet
text file:
hive默认采用text file 文件存储格式;
- create external table `order` (
- `order_id` string COMMENT '订单id',
- `user_id` string COMMENT '用户id',
- `create_date` string COMMENT '下单日期',
- `total_amount` decimal(16, 2) COMMENT '订单总金额'
- ) COMMENT '订单信息表'
- row format delimited fields terminated by ','
- lines terminated by '\n'
- stored as textfile
- location '/tmall/order';
复制代码- load data local inpath '/root/order.csv' into table order;
复制代码- hdfs dfs -mkdir -p /tmall/order_detail
- hdfs dfs -put /root/order_detail.csv /tmall/order_detail
复制代码 sequence file
sequence file 文件 是Hadoop用来存储二进制形式的的 key : value 键值对而设计的一种平面文件 flatmap
- create external table order_detail (
- `order_detail_id` string COMMENT '订单明细id',
- `order_id` string COMMENT '订单id',
- `sku_id` string COMMENT '商品id',
- `create_date` string COMMENT '下单日期',
- `price` decimal(16, 2) COMMENT '下单时的商品单价',
- `sku_num` int COMMENT '下单商品件数'
- ) COMMENT '订单明细表'
- row format delimited fields terminated by ','
- lines terminated by '\n'
- stored as textfile
- location '/tmall/order_detail';
复制代码- load data local inpath '/root/order_detail.csv' into table order_detail;
复制代码- hdfs dfs -mkdir -p /tmall/user_login
- hdfs dfs -put /root/user_login.csv /tmall/user_login
复制代码 ORC
- create external table user_login (
- `user_id` string comment '用户id',
- `ip_address` string comment 'ip地址',
- `login_ts` string comment '登录时间',
- `logout_ts` string comment '登出时间'
- ) COMMENT '登录明细表'
- row format delimited fields terminated by ','
- lines terminated by '\n'
- stored as textfile
- location '/tmall/user_login';
复制代码- load data local inpath '/root/user_login.csv' into table user_login;
复制代码- hdfs dfs -mkdir -p /tmall/sku_price_modify_detail
- hdfs dfs -put /root/sku_price_modify_detail.csv /tmall/sku_price_modify_detail
复制代码- create external table sku_price_modify_detail (
- `sku_id` string comment '商品id',
- `new_price` decimal(16, 2) comment '更改后的价格',
- `change_date` string comment '变动日期'
- ) COMMENT '商品价格变更明细表'
- row format delimited fields terminated by ','
- lines terminated by '\n'
- stored as textfile
- location '/tmall/sku_price_modify_detail';
复制代码 Parquet
- load data local inpath '/root/sku_price_modify_detail.csv' into table sku_price_modify_detail;
复制代码- create external table sku_price_modify_detail (
- `sku_id` string comment '商品id',
- `new_price` decimal(16, 2) comment '更改后的价格',
- `change_date` string comment '变动日期'
- ) COMMENT '商品价格变更明细表'
- row format delimited fields terminated by ','
- lines terminated by '\n'
- stored as textfile
- location '/tmall/sku_price_modify_detail';
复制代码 rcfile
- create external table delivery (
- `delivery_id` string comment '配送单id',
- `order_id` string comment '订单id',
- `user_id` string comment '用户id',
- `order_date` string comment '下单日期',
- `custom_date` string comment '期望配送日期'
- ) COMMENT '配送信息表'
- row format delimited fields terminated by ','
- lines terminated by '\n'
- stored as textfile
- location '/tmall/delivery';
复制代码- load data local inpath '/root/delivery.csv' into table delivery;
复制代码- hdfs dfs -mkdir -p /tmall/friendship
- hdfs dfs -put /root/friendship.csv /tmall/friendship
复制代码 免责声明:如果侵犯了您的权益,请联系站长,我们会及时删除侵权内容,谢谢合作! |