- Use ORC with bloom filters:
CREATE TABLE employee (
employee_id bigint,
name STRING
) STORED AS ORC
TBLPROPERTIES ("orc.bloom.filter.columns"="employee_id")
;
- Enable PPD with vectorizing, use CBO and Tez:
SET hive.optimize.ppd=true;
SET hive.optimize.ppd.storage=true;
SET hive.vectorized.execution.enabled=true;
SET hive.vectorized.execution.reduce.enabled = true;
SET hive.cbo.enable=true;
set hive.stats.autogather=true;
set hive.compute.query.using.stats=true;
set hive.stats.fetch.partition.stats=true;
set hive.execution.engine=tez;
set hive.stats.fetch.column.stats=true;
set hive.map.aggr=true;
SET hive.tez.auto.reducer.parallelism=true;
-
Tune proper parallelism on mappers and reducers:
–example for mappers:
set tez.grouping.max-size=67108864; set tez.grouping.min-size=32000000;
–example settings for reducers:
set hive.exec.reducers.bytes.per.reducer=67108864; --decrease this to increase the number of reducers
Change these figures to achieve optimal performance.