Carlos Aguni

Highly motivated self-taught IT analyst. Always learning and ready to explore new skills. An eternal apprentice.


Hive JSON to ORC

17 Nov 2022 »

http://cloudsqale.com/2018/12/26/storage-tuning-for-mapped-json-conversion-to-orc-file-format-java-heap-issues-with-dictionary-encoding/

Usually in a Data Lake we get source data as compressed JSON payloads (.gz files). Additionally, the first level of JSON objects is often parsed into map<string, string> structure to speed up the access to the first level keys/values, and then get_json_object function can be used to parse further JSON levels whenever required.

CREATE TABLE events
(
  event_timestamp string,
  event_name      string,
  app_name        string,
  app_version     string,
  platform        string,
  city            string,
  country         string,
  payload         map<string, string>   -- Mapped JSON data
)
PARTITIONED BY (event_dt string)
STORED AS ORC
 LOCATION 's3://cloudsqale/hive/events.db/events'
 TBLPROPERTIES ("orc.stripe.size"="268435456",      -- 256 MB stripe size
   "orc.block.padding"="false","orc.block.padding.tolerance"="1.0");

INSERT OVERWRITE TABLE events PARTITION (event_dt = '2018-12-26')
SELECT
  event_timestamp,
  event_name,
  app_name,
  app_version,
  platform,
  city,
  country,
  payload
FROM events_raw;