HiveQL Data Operation
1. Loading data into tables
load data local inpath '/data/employees' overwrite into table employees partition (country='US',state='CA')
If the partition directory does not exist, this command creates it automatically and copies the files into it.
If the target table is non-partitioned, omit the partition clause.
The local keyword means the path is a local directory; omitting it means the path is in HDFS.
The overwrite keyword means the load replaces the table's existing data instead of appending to it.
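The variants described above can be sketched as follows (the path is illustrative):

```sql
-- Without 'local' the path is read from HDFS, without 'overwrite' the files
-- are appended, and without a partition clause the target is non-partitioned:
load data inpath '/data/employees' into table employees;
```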
2. Insert data into tables through query statements
hive> insert overwrite table employees05 select * from employees;   # insert query results
hive> create table employees05 as select * from employees;          # create a table and insert data in one step
# Insert data from existing tables into partitioned tables
create table staged_employees(name string,country string,state string);

insert into staged_employees values
('zhao','China','OR'),
('qian','US','OR'),
('sun','US','CA'),
('li','US','IL'),
('zhou','US','OR'),
('wu','US','CA'),
('zheng','US','IL');

create table employees05(name string)
partitioned by (country string,state string)
row format delimited lines terminated by '\n';

insert overwrite table employees05 partition(country='US',state='OR')
select name from staged_employees se
where se.country='US' and se.state='OR';
# Scan data once and process data in multiple ways (target table can be partitioned or non-partitioned)
from staged_employees se
insert overwrite table employees05 partition(country='US',state='OR')
  select name where se.country='US' and se.state='OR'
insert overwrite table employees05 partition(country='US',state='CA')
  select name where se.country='US' and se.state='CA'
insert overwrite table employees05 partition(country='US',state='IL')
  select name where se.country='US' and se.state='IL';
The results are as follows.
hive (onhive)> select * from employees05;
OK
employees05.name  employees05.country  employees05.state
sun    US  CA
wu     US  CA
li     US  IL
zheng  US  IL
qian   US  OR
zhou   US  OR
hive (onhive)> show partitions employees05;
OK
partition
country=US/state=CA
country=US/state=IL
country=US/state=OR
# Dynamic partition insertion
Suppose the staged_employees table contains 100 distinct country/state combinations; after the query below executes successfully, the employees05 table will have 100 partitions.
hive (onhive)> insert overwrite table employees05
             > partition(country,state)
             > select name,country,state from staged_employees
             > where country='US';
FAILED: SemanticException [Error 10096]: Dynamic partition strict mode requires at least one static partition column. To turn this off set hive.exec.dynamic.partition.mode=nonstrict
The error above is reported because in this version (Hive 2.3.6) dynamic partitioning is enabled by default but runs in "strict" mode, which requires at least one static partition column. Strict mode helps prevent a query from mistakenly generating a huge number of partitions.
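Following the hint in the error message, one way to proceed is to relax the mode for the current session and rerun the insert (a sketch using the property named in the error text):

```sql
set hive.exec.dynamic.partition.mode=nonstrict;

insert overwrite table employees05
partition(country,state)
select name,country,state from staged_employees
where country='US';
```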
# Default dynamic partition properties
[root@bigdata hive_query]# hive -S -e "set" |grep 'hive.exec.*dynamic.*'
hive.exec.dynamic.partition=true              # true means dynamic partitioning is enabled
hive.exec.dynamic.partition.mode=strict       # setting it to nonstrict allows all partition columns to be dynamic
hive.exec.max.dynamic.partitions=1000         # maximum number of dynamic partitions one statement may create
hive.exec.max.dynamic.partitions.pernode=100  # maximum number of dynamic partitions each mapper or reducer may create
[root@bigdata hive_query]# hive -S -e "set" |grep 'hive.exec.max.created.files'
hive.exec.max.created.files=100000            # global maximum number of files that can be created; a Hadoop counter tracks how many have been created
# Dynamic and Static Partition Mixed Insertion
Static partitioning keys must appear before dynamic partitioning keys
insert overwrite table employees05 partition(country='US',state)
select name,state from staged_employees where country='US';

hive (onhive)> select * from employees05;
OK
employees05.name  employees05.country  employees05.state
sun    US  CA
wu     US  CA
li     US  IL
zheng  US  IL
qian   US  OR
zhou   US  OR
hive (onhive)> show partitions employees05;
OK
partition
country=US/state=CA
country=US/state=IL
country=US/state=OR
3. Export data
Objective: export data from tables.
# 1) If the data files are already in the format users need: simply copy the folder or files
hadoop fs -cp source_path target_path
# 2) Exporting in a custom data format
hive (onhive)> insert overwrite local directory '/tmp/ca_employees'
             > select * from employees05 where state='CA';
[root@bigdata hive_query]# cd /tmp/ca_employees/
[root@bigdata ca_employees]# ls
000000_0
hive (onhive)> !ls /tmp/ca_employees;
000000_0
hive (onhive)> !cat /tmp/ca_employees/000000_0;
sunUSCA
wuUSCA
As with inserting data into a table, users can specify multiple output directories, as follows:
--Output to HDFS
hive (onhive)> from staged_employees se
             > insert overwrite directory '/tmp/or_emp'
             > select * where se.country='US' and se.state='OR'
             > insert overwrite directory '/tmp/ca_emp'
             > select * where se.country='US' and se.state='CA'
             > insert overwrite directory '/tmp/IL_emp'
             > select * where se.country='US' and se.state='IL';
hive (onhive)> dfs -ls /tmp/*emp;
Found 1 items
-rwx-wx-wx 3 root supergroup 21 2019-09-17 14:16 /tmp/IL_emp/000000_0
Found 1 items
-rwx-wx-wx 3 root supergroup 19 2019-09-17 14:16 /tmp/ca_emp/000000_0
Found 1 items
-rwx-wx-wx 3 root supergroup 22 2019-09-17 14:16 /tmp/or_emp/000000_0

--Output to local
hive (onhive)> from staged_employees se
             > insert overwrite local directory '/tmp/or_emp'
             > select * where se.country='US' and se.state='OR'
             > insert overwrite local directory '/tmp/ca_emp'
             > select * where se.country='US' and se.state='CA'
             > insert overwrite local directory '/tmp/IL_emp'
             > select * where se.country='US' and se.state='IL';
[root@bigdata ca_employees]# ls -l /tmp |grep emp
drwxr-xr-x 2 root root 43 Sep 17 14:29 ca_emp
drwxr-xr-x 2 root root 43 Sep 17 13:52 ca_employees
drwxr-xr-x 2 root root 43 Sep 17 14:29 IL_emp
drwxr-xr-x 2 root root 43 Sep 17 14:29 or_emp
HiveQL query
1.SELECT ...FROM
hive (onhive)> desc employees05;
OK
col_name  data_type  comment
name     string
country  string
state    string

# Partition Information
# col_name  data_type  comment
country  string
state    string
Time taken: 3.062 seconds, Fetched: 9 row(s)

--Table and column aliases work as in Oracle; the AS keyword is optional
hive (onhive)> select em.name n,em.country c,em.state AS s from employees05 em;
OK
n      c   s
sun    US  CA
wu     US  CA
li     US  IL
zheng  US  IL
qian   US  OR
zhou   US  OR
2. Queries on collection types (MAP, ARRAY, STRUCT)
Here is a review of the table from the earlier post in this series, Hive Programming - Trivial 02:
CREATE TABLE onhive.employees(
  name STRING,
  sa1ary FLOAT,
  subordinates ARRAY<STRING>,
  deductions MAP<STRING, FLOAT>,
  address STRUCT<street:STRING, city:STRING, state:STRING, zip:INT>
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','           -- column separator
COLLECTION ITEMS TERMINATED BY '_' -- separator for MAP, STRUCT, and ARRAY items
MAP KEYS TERMINATED BY ':'         -- separator between key and value in a MAP
LINES TERMINATED BY '\n';          -- line separator
Test data:
vi test.txt
John Doe,100000.0,Mary Smith_Todd Jones,Federal Taxes:0.2_State Taxes:0.05_Insurance:0.1,1 Michigan Ave._Chicago_1L_60600
Tom Smith,90000.0,Jan_Hello Ketty,Federal Taxes:0.2_State Taxes:0.05_Insurance:0.1,Guang dong._China_0.5L_60661
PS: Note that the separators between elements of MAP, STRUCT, and ARRAY can all be the same character; here it is '_'.
Import data
load data local inpath '/opt/module/hive-2.3.6-bin/hive_query/test.txt' into table onhive.employees;
# With local, the target file is looked up on the local filesystem; without local, Hive looks for it in HDFS
hive> select * from employees;
OK
employees.name  employees.sa1ary  employees.subordinates  employees.deductions  employees.address
John Doe  100000.0  ["Mary Smith","Todd Jones"]  {"Federal Taxes":0.2,"State Taxes":0.05,"Insurance":0.1}  {"street":"1 Michigan Ave.","city":"Chicago","state":"1L","zip":60600}
Tom Smith  90000.0  ["Jan","Hello Ketty"]  {"Federal Taxes":0.2,"State Taxes":0.05,"Insurance":0.1}  {"street":"Guang dong.","city":"China","state":"0.5L","zip":60661}
Time taken: 3.839 seconds, Fetched: 2 row(s)
# Array, Map, Struct data access methods:
hive> select subordinates[0],deductions['Federal Taxes'],address.city from employees;
OK
_c0  _c1  city
Mary Smith  0.2  Chicago
Jan  0.2  China
Time taken: 0.841 seconds, Fetched: 2 row(s)
3.Limit clause
The LIMIT clause restricts the number of rows returned, similar to where rownum < 3 in Oracle.
hive (onhive)> select em.name n,em.country c, em.state as s from employees05 em limit 3;
OK
n    c   s
sun  US  CA
wu   US  CA
li   US  IL
4.Like and RLike
1) LIKE performs fuzzy matching of similar values
2) % matches zero or more arbitrary characters
_ matches exactly one character
3) The RLIKE clause is a Hive extension that lets matching conditions be specified with the more powerful Java regular expressions.
hive (onhive)> select * from employees05 where state rlike '[C]';
OK
employees05.name  employees05.country  employees05.state
sun  US  CA
wu   US  CA
hive (onhive)> select * from employees05 where state like 'C%';
OK
employees05.name  employees05.country  employees05.state
sun  US  CA
wu   US  CA
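The `_` wildcard described above can be sketched like this (a hypothetical query matching names whose second character is 'u'):

```sql
select * from employees05 where name like '_u%';
```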
5.GROUP BY
As in Oracle, any field that appears in the select clause but not in group by must be inside an aggregate function; otherwise an error is reported.
hive (onhive)> select country,state,count(*) from staged_employees group by country,state;
country  state  _c2
China  OR  1
US     CA  2
US     IL  2
US     OR  2
6.Having
The difference between having and where:
1) where filters on the columns of the table; having filters on the columns of the query result
2) the where clause cannot contain aggregate functions; having can
3) having can only be used together with group by
hive (onhive)> select country,state,count(*) from staged_employees group by country,state having count(*) >1;
country  state  _c2
US  CA  2
US  IL  2
US  OR  2
Join
1. Equi-join
Hive supports the usual SQL JOIN statements, but only equi-joins: non-equi join conditions are not supported, and OR is not allowed in the join predicate.
hive (onhive)> select se.name from staged_employees se,employees05 e where se.name=e.name;
--Equivalent to
hive (onhive)> select se.name from staged_employees se join employees05 e on se.name=e.name;
2.left join
A left join B: the returned rows are based on table A, the left side of the join; rows of A that match B keep B's columns (the intersection of A and B), and rows of A with no match in B have B's columns filled with NULL.
# Reload staged_employees to test the results
[root@bigdata hive_query]# vi test01.txt
zhang,China,CA
wang,US,CA
zhao,UK,BE
lin,China,CA
lang,US,CA
wu,US,CA
li,US,IL
zheng,US,IL
qian,US,OR
zhou,US,OR
[root@bigdata hive_query]# hive -S -e "load data local inpath '/opt/module/hive-2.3.6-bin/hive_query/test01.txt' overwrite into table staged_employees";
hive (onhive)> select * from employees05;
OK
employees05.name  employees05.country  employees05.state
sun    US  CA
wu     US  CA
li     US  IL
zheng  US  IL
qian   US  OR
zhou   US  OR
hive (onhive)> select se.name,se.country,se.state,e.name,e.country,e.state from staged_employees se left join employees05 e on se.name = e.name;
se.name  se.country  se.state  e.name  e.country  e.state
zhang  China  CA  NULL   NULL  NULL
wang   US     CA  NULL   NULL  NULL
zhao   UK     BE  NULL   NULL  NULL
lin    China  CA  NULL   NULL  NULL
lang   US     CA  NULL   NULL  NULL
wu     US     CA  wu     US    CA
li     US     IL  li     US    IL
zheng  US     IL  zheng  US    IL
qian   US     OR  qian   US    OR
zhou   US     OR  zhou   US    OR
3.right join
A right join B: the returned rows are based on table B, the right side of the join; rows of B that match A keep A's columns, and rows of B with no match in A have A's columns filled with NULL.
hive (onhive)> select se.name,se.country,se.state,e.name,e.country,e.state from staged_employees se right join employees05 e on se.name = e.name;
se.name  se.country  se.state  e.name  e.country  e.state
NULL   NULL  NULL  sun    US  CA
wu     US    CA    wu     US  CA
li     US    IL    li     US  IL
zheng  US    IL    zheng  US  IL
qian   US    OR    qian   US  OR
zhou   US    OR    zhou   US  OR
4.full join
A full join B: returns the union of A and B; rows with no match on the other side have the other table's columns filled with NULL.
hive (onhive)> select se.name,se.country,se.state,e.name,e.country,e.state from staged_employees se full join employees05 e on se.name = e.name;
se.name  se.country  se.state  e.name  e.country  e.state
lang   US     CA    NULL   NULL  NULL
li     US     IL    li     US    IL
lin    China  CA    NULL   NULL  NULL
qian   US     OR    qian   US    OR
NULL   NULL   NULL  sun    US    CA
wang   US     CA    NULL   NULL  NULL
wu     US     CA    wu     US    CA
zhang  China  CA    NULL   NULL  NULL
zhao   UK     BE    NULL   NULL  NULL
zheng  US     IL    zheng  US    IL
zhou   US     OR    zhou   US    OR
5. Sort By (sorting within each reducer)
sort by sorts within each reducer (an intra-partition sort); it does not sort the global result set.
1) Set the number of reducers
hive (default)> set mapreduce.job.reduces=3;
2) Check the configured number of reducers
hive (default)> set mapreduce.job.reduces;
hive (onhive)> select * from staged_employees sort by name;
6. Distribute By
distribute by: analogous to the partitioner in MapReduce, it controls which reducer each row is sent to, and is used in combination with sort by.
Note that Hive requires the DISTRIBUTE BY statement to be written before the SORT BY statement.
When testing distribute by, allocate multiple reducers; otherwise the effect of distribute by is not visible.
hive (onhive)> select * from staged_employees distribute by country sort by state;
7.Cluster By
When the distribute by and sort by fields are the same, cluster by can be used instead.
cluster by combines the functions of distribute by and sort by. However, the sort order is ascending only; ASC or DESC cannot be specified.
1) The following two ways of writing are equivalent
hive (default)> select * from emp cluster by deptno;
hive (default)> select * from emp distribute by deptno sort by deptno;
Note: distributing by department number does not assign one fixed value per partition; for example, departments 20 and 30 may land in the same partition.
hive (onhive)> select * from staged_employees distribute by country sort by country;
staged_employees.name  staged_employees.country  staged_employees.state
lin    China  CA
zhang  China  CA
zhou   US  OR
qian   US  OR
zheng  US  IL
li     US  IL
wu     US  CA
lang   US  CA
wang   US  CA
zhao   UK  BE
Bucketing and Sampling Query
1. Bucketed Table Data Storage
Partitions are for data storage paths; buckets are for data files.
Partitioning offers a convenient way to isolate data and optimize queries. However, not every dataset can be partitioned sensibly, especially given the difficulty, mentioned earlier, of choosing an appropriate partition size.
Bucketing is another technique for decomposing data into more manageable parts.
(1) Creating bucket tables
create table stu_buck(id int, name string)
clustered by(id) into 4 buckets
row format delimited fields terminated by '\t';
(2) View table structure
hive (onhive)> desc formatted stu_buck;
Num Buckets: 4
(3) Preparing data
[root@bigdata hive_query]# vi stu_buck.txt
100 s100
101 s1o1
102 s102
103 s103
104 s104
105 s105
106 s106
110 s110
112 s112
120 s120
(4) Import Data - Failure
hive (onhive)> load data local inpath '/opt/module/hive-2.3.6-bin/hive_query/stu_buck.txt' into table stu_buck;
FAILED: SemanticException Please load into an intermediate table and use 'insert... select' to allow Hive to enforce bucketing. Load into bucketed tables are disabled for safety reasons. If you know what you are doing, please set hive.strict.checks.bucketing to false and that hive.mapred.mode is not set to 'strict' to proceed. Note that if you may get errors or incorrect results if you make a mistake while using some of the unsafe features.
--Hint: for safety reasons, loading data directly into a bucketed table is disallowed by default; import via insert ... select instead. To force a direct load anyway, set hive.strict.checks.bucketing to false and make sure hive.mapred.mode is not set to 'strict'.
Workaround: create a plain table first and import into the bucketed table via a subquery
(1) Build a common stu table first
create table stu(id int, name string) row format delimited fields terminated by '\t';
(2) Importing data into common stu tables
load data local inpath '/opt/module/hive-2.3.6-bin/hive_query/stu_buck.txt' into table stu;
(3) Clear data from stu_buck table
truncate table stu_buck;
select * from stu_buck;
(4) Importing data into bucket tables through sub-queries
insert into table stu_buck select id, name from stu;
(5) An attribute used to be required here; Hive 2.3.6 no longer has this parameter and enforces bucketing by default.
set hive.enforce.bucketing=true;
(6) Query the data of the bucketed table
Looking directly at the table data, you can see that in this Hive version the data within each bucket is stored in descending order.
hive (onhive)> select * from stu_buck;
OK
stu_buck.id  stu_buck.name
120  s120
112  s112
104  s104
100  s100
105  s105
101  s101
110  s110
106  s106
102  s102
103  s103
2. Bucket Sampling Query
Query the data in the first bucket
hive (onhive)> select * from stu_buck tablesample(bucket 1 out of 4 on id);
OK
stu_buck.id  stu_buck.name
120  s120
112  s112
104  s104
100  s100
tablesample(bucket x out of y)
x: the bucket to draw data from (1, 2, 3, or 4)
y: the denominator determining the fraction of data extracted.
For example, with four buckets:
tablesample(bucket 1 out of 16) draws from the first bucket; the fraction is 4 (number of buckets) / 16 (denominator) = 1/4, so one quarter of the first bucket's data is extracted.
tablesample(bucket 2 out of 32) draws from the second bucket; the fraction is 4 (number of buckets) / 32 (denominator) = 1/8, so one eighth of the second bucket's data is extracted.
Note: the value of x must be less than or equal to the value of y; otherwise the query fails with
FAILED: SemanticException [Error 10061]: Numerator should not be bigger than denominator in sample clause for table stu_buck
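To check which bucket each row lands in, one can mirror Hive's bucketing rule, pmod(hash(column), numBuckets); this is a sketch assuming the built-in hash() and pmod() functions:

```sql
-- bucket assignment sketch: for an int column, hash(id) is the value itself
select id, pmod(hash(id), 4) as bucket from stu_buck;
```

Rows where pmod(hash(id), 4) = 0 (ids 100, 104, 112, 120) form the first bucket, matching the tablesample output above.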
3. Data Block Sampling
Hive provides another way of sampling, by percentage: rather than a row count, it samples a percentage of the data blocks under the input path.
hive (default)> select * from stu tablesample(0.1 percent) ;
Tip: this sampling method is not guaranteed to work with all file formats. Also, the minimum sampling unit is one HDFS data block, so if the table's data is smaller than 128 MB, all rows are returned.