Hive
Hive Data Types
Quick Tips
Query a date range with the BETWEEN ... AND clause
SELECT
  api_logtime
FROM
  all_weblog
WHERE
  log_mon_i = '2017-06'
  AND from_unixtime(unix_timestamp(api_logtime), 'yyyy-MM-dd HH:mm:ss')
      BETWEEN '2017-06-01 15:33:22' AND '2017-06-01 15:35:00'
LIMIT 100;
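If api_logtime is already stored in 'yyyy-MM-dd HH:mm:ss' form (an assumption about the schema), the unix_timestamp/from_unixtime round-trip only normalizes the format, and the comparison can be applied directly. A minimal sketch:
SELECT
  api_logtime
FROM
  all_weblog
WHERE
  log_mon_i = '2017-06'
  -- assumes api_logtime is already 'yyyy-MM-dd HH:mm:ss' text
  AND api_logtime BETWEEN '2017-06-01 15:33:22' AND '2017-06-01 15:35:00'
LIMIT 100;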
UPDATE, DELETE and INSERT under Hive 0.14
Symptom: FAILED: SemanticException [Error 10294]: Attempt to do update or delete using transaction manager that does not support these operations in hive-0.14.0
INSERT...VALUES / UPDATE / DELETE is available starting in Hive 0.14, and can only be performed on tables that support ACID.
Add the hive-site.xml settings below, then restart Hive:
hive.enforce.bucketing = true
hive.exec.dynamic.partition.mode = nonstrict
hive.txn.manager = org.apache.hadoop.hive.ql.lockmgr.DbTxnManager
hive.compactor.initiator.on = true
hive.compactor.worker.threads = 1
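After the restart, the effective values can be double-checked from the Hive CLI, since SET with a property name and no value prints the current setting. For example:
-- each line prints property=value for the named setting
SET hive.txn.manager;
SET hive.enforce.bucketing;
SET hive.compactor.initiator.on;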
Example:
CREATE TABLE
CREATE TABLE HiveTest (
  EmployeeID Int,
  FirstName String,
  Designation String,
  Salary Int,
  Department String
)
CLUSTERED BY (Department) INTO 3 BUCKETS
STORED AS ORC TBLPROPERTIES ('transactional' = 'true');
INSERT
INSERT INTO TABLE HiveTest
VALUES (21, 'Hive', 'Hive', 0, 'B');
UPDATE
UPDATE HiveTest SET Salary = 1000
WHERE EmployeeID = 21;
DELETE
DELETE FROM HiveTest
WHERE EmployeeID = 21;
Export/write a table to the local filesystem as delimited text
Note: the target path names a directory, not a single file, and the statement below writes tab-delimited fields despite the .csv name; a comma-delimited variant follows it.
INSERT OVERWRITE LOCAL DIRECTORY 'orderlist_i2custlist.csv'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
COLLECTION ITEMS TERMINATED BY ','
SELECT
GOODS_CODE, collect_set(cust_no)
FROM
orderlist
GROUP BY
GOODS_CODE
;
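For truly comma-separated fields, swap the delimiters in the same statement; a minimal variant (the '|' collection delimiter is an arbitrary choice to keep collect_set elements separable from the comma field separator):
INSERT OVERWRITE LOCAL DIRECTORY 'orderlist_i2custlist.csv'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
COLLECTION ITEMS TERMINATED BY '|'
SELECT
GOODS_CODE, collect_set(cust_no)
FROM
orderlist
GROUP BY
GOODS_CODE
;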
Create a new table from a SELECT statement (CTAS)
DROP TABLE IF EXISTS u2i_pref;
CREATE TABLE u2i_pref AS
SELECT
cust_no as uid,
goods_code as gid,
1 as pref_score
FROM
sub_orderlist
;
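CTAS uses the default file format (hive.default.fileformat, normally plain text); a storage format can be named in the same statement. A sketch of the same CTAS, assuming ORC output is wanted:
DROP TABLE IF EXISTS u2i_pref;
-- same CTAS, but the new table is stored as ORC
CREATE TABLE u2i_pref
STORED AS ORC
AS
SELECT
cust_no as uid,
goods_code as gid,
1 as pref_score
FROM
sub_orderlist
;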
Write query results into an HDFS directory with a specified file format
set codeName=sohappy;
INSERT OVERWRITE DIRECTORY '/tmp/tmp_table'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
SELECT
page_type, categ_code, gid, ven_guid, api_logtime
FROM ${hiveconf:codeName}_unima.all_weblog
WHERE page_type = 'gop'
LIMIT 100
;
Import/create an external table from files in HDFS
CREATE EXTERNAL TABLE i2i_sim
(
item string,
cooc_item string,
sim double
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
LOCATION '/user/root/i2i_count';
Note: the LOCATION path, i.e. /user/root/i2i_count, must be a directory (Hive reads every file inside it), not a single file.
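The Hive CLI forwards dfs commands to HDFS, so the LOCATION directory can be inspected without leaving the shell:
-- list the data files the external table will read
dfs -ls /user/root/i2i_count;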
De-Duplication with Windowing Functions
Eliminate rows that share the same combination of '''ven_session, gid'''. ROW_NUMBER() numbers the rows within each partition starting at 1, so keeping dup_cnt <= 1 (equivalently = 1) retains exactly one row per combination.
SELECT *
FROM (
SELECT
ROW_NUMBER() OVER (PARTITION BY ven_session, gid) AS dup_cnt,
ven_session,
gid,
page_type
FROM sohappy_unima.all_weblog
WHERE
page_type = 'gop'
AND ven_session is not NULL
AND ven_session != ''
AND gid is not NULL
AND gid != ''
) gop_only_t
WHERE
dup_cnt <= 1
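With no ORDER BY in the window, which duplicate receives dup_cnt = 1 is arbitrary. To keep, say, the earliest event per combination, order the window; a sketch assuming api_logtime carries the event time (as it does elsewhere on this page):
-- deterministic variant: the earliest row per (ven_session, gid) gets dup_cnt = 1
ROW_NUMBER() OVER (PARTITION BY ven_session, gid ORDER BY api_logtime) AS dup_cnt,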
Hive UDF : Brickhouse Confessions (noted by UFan)
- Brickhouse Downloads
- SOURCE DEFINED FUNCTION
USAGE:
- Add the jar "target/brickhouse-<version>.jar" to your HIVE_AUX_JARS_FILE_PATH, or add it to the distributed cache from the Hive CLI with the "add jar" command. Our UDF jar path is currently set under /usr/lib/hive/auxlib/ ([http://blog.csdn.net/skywalker_only/article/details/35999955 4 reference approaches]).
CDH path: /opt/cloudera/parcels/CDH-5.3.2-1.cdh5.3.2.p0.10/lib/hive/auxlib/
HDP path: /usr/lib/hive/auxlib/
Source the UDF declarations defined in src/main/resource/brickhouse.hql
--# add it to the distributed cache OR you can add the jars into local path /usr/lib/hive/auxlib/ and restart Hive.
ADD JAR /home/ua00551/brickhouse-0.6.0.jar;
--# Source the UDF declarations
CREATE TEMPORARY FUNCTION to_json AS 'brickhouse.udf.json.ToJsonUDF';
--# Using the to_json() function, convert a named_struct into a JSON string
select to_json( named_struct("camel_case", 1,
                             "big_double", 9999.99,
                             "the_really_cool_value", "a string val",
                             "another_array", array(4,5,6),
                             "random_map", map("a",2.3,"b",5.6) ),
                true)
from mytable;
Quick Tips for Using the Hive Shell Inside Scripts (noted by UFan)
Examples of Linux shell scripts:
ex1: Capture the result of a Hive query
VAL=$(hive -e "SELECT count(*) FROM yourTable;")
echo ${VAL}
ex2: Execute a HiveQL file
hive -f yourHiveQLFile.hql
Options
-f  execute an .hql file (e.g. hive -f xxxx.hql)
-e  execute a quoted "hive query" string
-S  run in silent mode
Hive For RaaS
Hive Data Checks
Count the daily number of order-list records
select to_date(order_date),count(*) from gohappy_unima.all_orderlist group by to_date(order_date) ;
Count the daily number of weblog records
select to_date(api_logtime),count(*) from gohappy_unima.all_weblog group by to_date(api_logtime) ;
Query weblog records within a time range on a given day
select * from gohappy_unima.all_weblog where api_logtime < unix_timestamp('2015-10-05 00:00:00') and api_logtime > unix_timestamp('2015-10-04 00:00:00') limit 100 ;
In the daily weblogs, count per browser agent the records that arrived with null partition values (the __HIVE_DEFAULT_PARTITION__ partition)
select agent, ccnt
from (
  select agent, count(*) ccnt
  from raas_system.daily_unima_logdata
  where partitionfield = '__HIVE_DEFAULT_PARTITION__'
  group by agent
) a
order by ccnt desc;