http://cloudsqale.com/2018/12/26/storage-tuning-for-mapped-json-conversion-to-orc-file-format-java-heap-issues-with-dictionary-encoding/
Usually in a Data Lake we get source data as compressed JSON payloads (.gz files). Additionally, the first level of JSON objects is often parsed into map<string, string> structure to speed up the access to the first level keys/values, and then get_json_object function can be used to parse further JSON levels whenever required.
CREATE TABLE events
(
event_timestamp string,
event_name string,
app_name string,
app_version string,
platform string,
city string,
country string,
payload map<string, string> -- Mapped JSON data
)
PARTITIONED BY (event_dt string)
STORED AS ORC
LOCATION 's3://cloudsqale/hive/events.db/events'
TBLPROPERTIES ("orc.stripe.size"="268435456", -- 256 MB stripe size
"orc.block.padding"="false","orc.block.padding.tolerance"="1.0");
INSERT OVERWRITE TABLE events PARTITION (event_dt = '2018-12-26')
SELECT
event_timestamp,
event_name,
app_name,
app_version,
platform,
city,
country,
payload
FROM events_raw;