Exercise 23 - Hive and compression

compression
hive
#1

Problem statement

  • Compress largedeck using Hive
  • Steps
    • Copy data from /public/cards to your home directory (using hadoop fs -cp; a sample command follows this list)
    • Create an external table pointing to the above location
    • Set the necessary parameters for compression of both map output and reduce output
    • CTAS - change the delimiter of the data to "," and compress the data
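
A minimal sketch of the copy step (the destination path and the <your_username> placeholder are assumptions; substitute your own home directory):

hadoop fs -cp /public/cards /user/<your_username>/cards
hadoop fs -ls /user/<your_username>/cards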

Please provide the following details

  • Create external table command
  • CTAS command
  • output of dfs -ls on the location of the table

#2

set hive.exec.compress.output=true;
set hive.exec.compress.intermediate=true;
set mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.GzipCodec;
set mapreduce.map.output.compress.codec=org.apache.hadoop.io.compress.GzipCodec;
set mapreduce.input.fileinputformat.split.minsize=268435456;
Create external table command

use saswat232;
create external table cards_external (
colour String, type String, num String)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY "|"
location "/user/saswat232/cards";

CTAS command
create table cmprs_table
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ","
as select * from cards_external;

output of dfs -ls on the location of the table:

dfs -ls /apps/hive/warehouse/saswat232.db/cmprs_table;
Found 3 items
-rwxrwxrwx 3 saswat232 hdfs 1301604 2016-12-22 01:32 /apps/hive/warehouse/saswat232.db/cmprs_table/000000_0.gz
-rwxrwxrwx 3 saswat232 hdfs 1301602 2016-12-22 01:32 /apps/hive/warehouse/saswat232.db/cmprs_table/000001_0.gz
-rwxrwxrwx 3 saswat232 hdfs 920323 2016-12-22 01:32 /apps/hive/warehouse/saswat232.db/cmprs_table/000002_0.gz


#3

Create external table command

hive> CREATE EXTERNAL TABLE IF NOT EXISTS largedeck
    > (color string,
    > suit string,
    > pip string)
    > ROW FORMAT DELIMITED FIELDS TERMINATED BY "|"
    > location "/user/jasonbourne/cards/";

Parameters:

set hive.exec.compress.output=true;
set hive.exec.compress.intermediate=true;
set mapreduce.output.fileoutputformat.compress=true;
set mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.GzipCodec;

CTAS command

hive>
    > CREATE TABLE IF NOT EXISTS largedeckCompressed
    > ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
    > AS SELECT * FROM largedeck;

output of dfs -ls on the location of the table

[jasonbourne@gw01 ~]$ hadoop fs -ls /apps/hive/warehouse/jasonbourne.db/largedeckcompressed
Found 3 items
-rwxrwxrwx   3 jasonbourne hdfs    1301604 2016-12-22 01:36 /apps/hive/warehouse/jasonbourne.db/largedeckcompressed/000000_0.gz
-rwxrwxrwx   3 jasonbourne hdfs    1301602 2016-12-22 01:36 /apps/hive/warehouse/jasonbourne.db/largedeckcompressed/000001_0.gz
-rwxrwxrwx   3 jasonbourne hdfs     920323 2016-12-22 01:36 /apps/hive/warehouse/jasonbourne.db/largedeckcompressed/000002_0.gz

#4

create external table sumanthsharma21.largedeck(col1 string, col2 string, col3 string)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
"separatorChar" = "|",
"quoteChar" = "'",
"escapeChar" = "\\"
)
STORED AS TEXTFILE
location '/user/sumanthsharma21/large_deck/';
set hive.exec.compress.output=true;
set hive.exec.compress.intermediate=true;
set mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.GzipCodec;
set mapreduce.map.output.compress.codec=org.apache.hadoop.io.compress.GzipCodec;
set mapreduce.input.fileinputformat.split.minsize=268435456;
create table sumanthsharma21.cmpressed_deck
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ","
as select * from sumanthsharma21.largedeck;

hadoop fs -ls /apps/hive/warehouse/sumanthsharma21.db/cmpressed_deck
Found 3 items
-rwxrwxrwx 3 sumanthsharma21 hdfs 1301604 2016-12-22 01:46 /apps/hive/warehouse/sumanthsharma21.db/cmpressed_deck/000000_0.gz
-rwxrwxrwx 3 sumanthsharma21 hdfs 1301602 2016-12-22 01:46 /apps/hive/warehouse/sumanthsharma21.db/cmpressed_deck/000001_0.gz
-rwxrwxrwx 3 sumanthsharma21 hdfs 920323 2016-12-22 01:46 /apps/hive/warehouse/sumanthsharma21.db/cmpressed_deck/000002_0.gz


#5

#Create external table command

set hive.exec.compress.output=true;
set hive.exec.compress.intermediate=true;
set mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.GzipCodec;
set mapreduce.map.output.compress.codec=org.apache.hadoop.io.compress.GzipCodec;
set mapreduce.input.fileinputformat.split.minsize=268435456; 

CREATE EXTERNAL TABLE IF NOT EXISTS cards( 
color string, 
type string, 
number string) 
ROW FORMAT DELIMITED 
FIELDS TERMINATED BY "|" 
STORED AS TEXTFILE 
location '/user/farhanmisarwala/data/cards/';

#CTAS command

create table compres_cards
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ","
as select * from cards;

#output of dfs -ls on the location of the table

[farhanmisarwala@gw01 ~]$ hadoop fs -ls -h /apps/hive/warehouse/farhan.db/compres_cards
Found 3 items
-rwxrwxrwx   3 farhanmisarwala hdfs      1.2 M 2016-12-22 01:42 /apps/hive/warehouse/farhan.db/compres_cards/000000_0.gz
-rwxrwxrwx   3 farhanmisarwala hdfs      1.2 M 2016-12-22 01:42 /apps/hive/warehouse/farhan.db/compres_cards/000001_0.gz
-rwxrwxrwx   3 farhanmisarwala hdfs    898.8 K 2016-12-22 01:42 /apps/hive/warehouse/farhan.db/compres_cards/000002_0.gz

#6

create external table deckcards(color string,suit string,number int)
> row format delimited
> fields terminated by '|'
> location '/user/paramesh/cardsext';

create table deckcompreshive
> row format delimited
> fields terminated by ','
> as select * from deckcards;

-rwxrwxrwx 3 paramesh hdfs 1331634 2016-12-22 01:45 /apps/hive/warehouse/paramesh.db/deckcompreshive/000000_0.gz
-rwxrwxrwx 3 paramesh hdfs 1331632 2016-12-22 01:45 /apps/hive/warehouse/paramesh.db/deckcompreshive/000001_0.gz
-rwxrwxrwx 3 paramesh hdfs 941555 2016-12-22 01:45 /apps/hive/warehouse/paramesh.db/deckcompreshive/000002_0.gz


#7

create external table largedeck(color string , type string, number varchar(10))
row format delimited fields terminated by '|'
location '/user/parulshine92/cards';

create table CompressLrageDeck
row format delimited fields terminated by ','
as select * from largedeck;

-rwxrwxrwx 3 parulshine92 hdfs 340602 2016-12-22 01:41 /apps/hive/warehouse/compresslragedeck/000000_0.deflate
-rwxrwxrwx 3 parulshine92 hdfs 340600 2016-12-22 01:41 /apps/hive/warehouse/compresslragedeck/000001_0.deflate
-rwxrwxrwx 3 parulshine92 hdfs 340598 2016-12-22 01:41 /apps/hive/warehouse/compresslragedeck/000002_0.deflate
-rwxrwxrwx 3 parulshine92 hdfs 340597 2016-12-22 01:41 /apps/hive/warehouse/compresslragedeck/000003_0.deflate
-rwxrwxrwx 3 parulshine92 hdfs 340597 2016-12-22 01:41 /apps/hive/warehouse/compresslragedeck/000004_0.deflate
-rwxrwxrwx 3 parulshine92 hdfs 310463 2016-12-22 01:41 /apps/hive/warehouse/compresslragedeck/000005_0.deflate
-rwxrwxrwx 3 parulshine92 hdfs 310462 2016-12-22 01:41 /apps/hive/warehouse/compresslragedeck/000006_0.deflate
-rwxrwxrwx 3 parulshine92 hdfs 310461 2016-12-22 01:41 /apps/hive/warehouse/compresslragedeck/000007_0.deflate
-rwxrwxrwx 3 parulshine92 hdfs 310461 2016-12-22 01:41 /apps/hive/warehouse/compresslragedeck/000008_0.deflate
-rwxrwxrwx 3 parulshine92 hdfs 310460 2016-12-22 01:41 /apps/hive/warehouse/compresslragedeck/000009_0.deflate
-rwxrwxrwx 3 parulshine92 hdfs 269614 2016-12-22 01:41 /apps/hive/warehouse/compresslragedeck/000010_0.deflate
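
Note: the .deflate extension indicates the default codec (DefaultCodec) was in effect for the output rather than Gzip. If .gz part files are expected, as in the other answers, setting the codec explicitly before the CTAS should produce them (a sketch, assuming default settings otherwise):

set hive.exec.compress.output=true;
set mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.GzipCodec;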


#8

create external table cards (
color string, suit string, pip string
)row format delimited fields terminated by '|'
location '/user/aruncse11/arun/cards';

hive> set hive.exec.compress.output=true;
hive> set mapreduce.output.fileoutputformat.compress=true;
hive> set mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.GzipCodec;
hive> set mapreduce.output.fileoutputformat.compress.type=BLOCK;
hive> set hive.exec.compress.intermediate=true;

hive> create table cmprs_table
> ROW FORMAT DELIMITED
> FIELDS TERMINATED BY ","
> as select * from cards;

hive> dfs -ls /apps/hive/warehouse/arun.db/cmprs_table;
Found 3 items
-rwxrwxrwx 3 aruncse11 hdfs 1301604 2016-12-22 01:53 /apps/hive/warehouse/arun.db/cmprs_table/000000_0.gz
-rwxrwxrwx 3 aruncse11 hdfs 1301602 2016-12-22 01:53 /apps/hive/warehouse/arun.db/cmprs_table/000001_0.gz
-rwxrwxrwx 3 aruncse11 hdfs 920323 2016-12-22 01:53 /apps/hive/warehouse/arun.db/cmprs_table/000002_0.gz


#9

#Create external table command

set hive.exec.compress.intermediate=true;
set hive.exec.compress.output=true;
set mapreduce.input.fileinputformat.split.maxsize=64000000;

create external table largedeck (color string,type string,number int)
row format delimited fields terminated by '|'
location '/user/infosnehasish/cards';

#CTAS command

create table largedeck_compressed
row format delimited fields terminated by ","
as
select * from largedeck;

#output of dfs -ls on the location of the table

hive> dfs -ls -R hdfs://nn01.itversity.com:8020/apps/hive/warehouse/infosnehasish.db/largedeck_compressed;
-rwxrwxrwx 3 infosnehasish hdfs 348461 2016-12-22 01:45 hdfs://nn01.itversity.com:8020/apps/hive/warehouse/infosnehasish.db/largedeck_compressed/000000_0.deflate
-rwxrwxrwx 3 infosnehasish hdfs 348458 2016-12-22 01:45 hdfs://nn01.itversity.com:8020/apps/hive/warehouse/infosnehasish.db/largedeck_compressed/000001_0.deflate
-rwxrwxrwx 3 infosnehasish hdfs 348456 2016-12-22 01:45 hdfs://nn01.itversity.com:8020/apps/hive/warehouse/infosnehasish.db/largedeck_compressed/000002_0.deflate
-rwxrwxrwx 3 infosnehasish hdfs 348455 2016-12-22 01:45 hdfs://nn01.itversity.com:8020/apps/hive/warehouse/infosnehasish.db/largedeck_compressed/000003_0.deflate
-rwxrwxrwx 3 infosnehasish hdfs 348455 2016-12-22 01:45 hdfs://nn01.itversity.com:8020/apps/hive/warehouse/infosnehasish.db/largedeck_compressed/000004_0.deflate
-rwxrwxrwx 3 infosnehasish hdfs 317612 2016-12-22 01:45 hdfs://nn01.itversity.com:8020/apps/hive/warehouse/infosnehasish.db/largedeck_compressed/000005_0.deflate
-rwxrwxrwx 3 infosnehasish hdfs 317611 2016-12-22 01:45 hdfs://nn01.itversity.com:8020/apps/hive/warehouse/infosnehasish.db/largedeck_compressed/000006_0.deflate
-rwxrwxrwx 3 infosnehasish hdfs 317608 2016-12-22 01:45 hdfs://nn01.itversity.com:8020/apps/hive/warehouse/infosnehasish.db/largedeck_compressed/000007_0.deflate
-rwxrwxrwx 3 infosnehasish hdfs 317608 2016-12-22 01:45 hdfs://nn01.itversity.com:8020/apps/hive/warehouse/infosnehasish.db/largedeck_compressed/000008_0.deflate
-rwxrwxrwx 3 infosnehasish hdfs 317608 2016-12-22 01:45 hdfs://nn01.itversity.com:8020/apps/hive/warehouse/infosnehasish.db/largedeck_compressed/000009_0.deflate
-rwxrwxrwx 3 infosnehasish hdfs 275820 2016-12-22 01:44 hdfs://nn01.itversity.com:8020/apps/hive/warehouse/infosnehasish.db/largedeck_compressed/000010_0.deflate


#10

Create table command:

CREATE EXTERNAL TABLE cards_deck(
> color string,
> suit string,
> rank string)
> ROW FORMAT DELIMITED
> FIELDS TERMINATED BY '|'
> STORED AS TEXTFILE
> LOCATION '/user/mahesh007/cards/';

CTAS command:

create table cards_compress
> row format delimited
> fields terminated by ','
> as select * from cards_deck;

Output:

hive> dfs -ls /apps/hive/warehouse/mahesh007.db/cards_compress;

Found 3 items
-rwxrwxrwx 3 mahesh007 hdfs 1301604 2016-12-22 02:07 /apps/hive/warehouse/mahesh007.db/cards_compress/000000_0.gz
-rwxrwxrwx 3 mahesh007 hdfs 1301602 2016-12-22 02:07 /apps/hive/warehouse/mahesh007.db/cards_compress/000001_0.gz
-rwxrwxrwx 3 mahesh007 hdfs 920323 2016-12-22 02:07 /apps/hive/warehouse/mahesh007.db/cards_compress/000002_0.gz


#11

Set the required properties from the Hive command line:

hive> set mapreduce.job.reduces;
mapreduce.job.reduces=6
hive> set mapreduce.input.fileinputformat.split.minsize=64000000;
hive> set hive.exec.compress.intermediate=true;
hive> set hive.exec.compress.output=true;
hive> set mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.GzipCodec;
hive> set mapreduce.map.output.compress.codec=org.apache.hadoop.io.compress.GzipCodec;
hive> show tables;

Create table script:

hive> create external table largedeck_external
    > (card_name string, card_type string, card_no string)
    > row format delimited fields terminated by '|'
    > location '/user/nagellarajashyam/deck_cards';

hive> create table largedeck_compress
    > row format delimited fields terminated by ','
    > as select * from largedeck_external;

hive> dfs -ls hdfs://nn01.itversity.com:8020/apps/hive/warehouse/nagellarajashyam.db/largedeck_compress;
Found 3 items
-rwxrwxrwx 3 nagellarajashyam hdfs 1301604 2016-12-22 01:54 hdfs://nn01.itversity.com:8020/apps/hive/warehouse/nagellarajashyam.db/largedeck_compress/000000_0.gz
-rwxrwxrwx 3 nagellarajashyam hdfs 1301602 2016-12-22 01:54 hdfs://nn01.itversity.com:8020/apps/hive/warehouse/nagellarajashyam.db/largedeck_compress/000001_0.gz
-rwxrwxrwx 3 nagellarajashyam hdfs 920323 2016-12-22 01:54 hdfs://nn01.itversity.com:8020/apps/hive/warehouse/nagellarajashyam.db/largedeck_compress/000002_0.gz

hive> describe formatted largedeck_compress;
OK

col_name data_type comment

card_name string
card_type string
card_no string

Detailed Table Information

Database: nagellarajashyam
Owner: nagellarajashyam
CreateTime: Thu Dec 22 01:54:29 EST 2016
LastAccessTime: UNKNOWN
Protect Mode: None
Retention: 0
Location: hdfs://nn01.itversity.com:8020/apps/hive/warehouse/nagellarajashyam.db/largedeck_compress
Table Type: MANAGED_TABLE
Table Parameters:
COLUMN_STATS_ACCURATE {"BASIC_STATS":"true"}
numFiles 3
numRows 54525952
rawDataSize 672137216
totalSize 3523529
transient_lastDdlTime 1482389669

Storage Information

SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
InputFormat: org.apache.hadoop.mapred.TextInputFormat
OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
Compressed: No
Num Buckets: -1
Bucket Columns: []
Sort Columns: []
Storage Desc Params:
field.delim ,
serialization.format ,
Time taken: 0.175 seconds, Fetched: 34 row(s)
