Carlos Aguni

Highly motivated self-taught IT analyst. Always learning and ready to explore new skills. An eternal apprentice.


Hive-bench Study

20 May 2022 »

prologue

# Base packages: JDK 8 (required by Hive 3.1 / Hadoop 3.3), screen for
# long-running sessions, htop for monitoring.
yum -y install java-1.8.0-openjdk-devel
yum -y install screen
yum -y install htop
# Install locations and PATH wiring for Hadoop and Hive.
export HADOOP_HOME="/root/hadoop-3.3.2"
export PATH="${HADOOP_HOME}/bin:$PATH"
export HIVE_HOME="/root/apache-hive-3.1.2-bin"
export PATH="${HIVE_HOME}/bin:$PATH"
# NOTE(review): hard-coded JDK build string — must be updated if the
# java-1.8.0-openjdk package version changes.
export JAVA_HOME="/usr/lib/jvm/java-1.8.0-openjdk-1.8.0.332.b09-1.el8_5.x86_64"
# Expose every Hadoop component's dependency jars on the Hadoop classpath.
export HADOOP_CLASSPATH=/root/hadoop-3.3.2/share/hadoop/tools/lib/*
export HADOOP_CLASSPATH="${HADOOP_CLASSPATH}:/root/hadoop-3.3.2/share/hadoop/hdfs/lib/*"
export HADOOP_CLASSPATH="${HADOOP_CLASSPATH}:/root/hadoop-3.3.2/share/hadoop/common/lib/*"
export HADOOP_CLASSPATH="${HADOOP_CLASSPATH}:/root/hadoop-3.3.2/share/hadoop/client/lib/*"
export HADOOP_CLASSPATH="${HADOOP_CLASSPATH}:/root/hadoop-3.3.2/share/hadoop/mapreduce/lib/*"
export HADOOP_CLASSPATH="${HADOOP_CLASSPATH}:/root/hadoop-3.3.2/share/hadoop/yarn/lib/*"


# tez
# Put the Tez jars (and their dependencies) on the Hadoop classpath.
export HADOOP_CLASSPATH="${HADOOP_CLASSPATH}:/root/apache-tez-0.9.2-bin/lib/*"
export HADOOP_CLASSPATH="${HADOOP_CLASSPATH}:/root/apache-tez-0.9.2-bin/*"
export TEZ_HOME=/root/apache-tez-0.9.2-bin

# hdfs + yarn
# Allow the daemons to start as root (single-node sandbox only —
# never do this on a production cluster).
export HDFS_NAMENODE_USER="root"
export HDFS_DATANODE_USER="root"
export HDFS_SECONDARYNAMENODE_USER="root"
export YARN_RESOURCEMANAGER_USER="root"
export YARN_NODEMANAGER_USER="root"


# Convenience wrapper: run `$HIVE -e "..."` to submit SQL through beeline.
export HIVE="beeline -u 'jdbc:hive2://localhost:10000'  "


.vimrc

set tabstop=4     " display tab as 4-space wide
set shiftwidth=4  " indentation width when using << and >>
set expandtab

install tpc-ds

# Build the hive-testbench TPC-DS data generator (needs gcc & friends).
yum -y groupinstall "Development tools"
git clone https://github.com/hortonworks/hive-testbench.git
cd hive-testbench
# yum -y install java-11-openjdk java-11-openjdk-devel
# cloudera openjdk8 already installed
./tpcds-build.sh

install (hadoop + hdfs + yarn) + hive + tez

# Download and unpack Hadoop 3.3.2, Hive 3.1.2 and Tez 0.9.2 under /root.
curl -O https://dlcdn.apache.org/hadoop/common/hadoop-3.3.2/hadoop-3.3.2.tar.gz
tar xzvf hadoop-3.3.2.tar.gz 

curl -O https://dlcdn.apache.org/hive/hive-3.1.2/apache-hive-3.1.2-bin.tar.gz
tar xzvf apache-hive-3.1.2-bin.tar.gz 

curl -O https://dlcdn.apache.org/tez/0.9.2/apache-tez-0.9.2-bin.tar.gz
tar xzvf apache-tez-0.9.2-bin.tar.gz

configure hadoop

core-site.xml

# Write core-site.xml: point the default filesystem at the local
# single-node HDFS NameNode on port 9000.
# (Fixed the misaligned <property> element from the original.)
cat > ${HADOOP_HOME}/etc/hadoop/core-site.xml <<EOF
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. See accompanying LICENSE file.
-->

<!-- Put site-specific property overrides in this file. -->

<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://localhost:9000</value>
    </property>
</configuration>
EOF

configure hdfs

https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/SingleCluster.html

${HADOOP_HOME}/bin/hdfs

# Write hdfs-site.xml: replication factor 1 is appropriate for this
# single-node setup (the default of 3 would leave blocks under-replicated).
cat > ${HADOOP_HOME}/etc/hadoop/hdfs-site.xml <<EOF
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. See accompanying LICENSE file.
-->

<!-- Put site-specific property overrides in this file. -->

<configuration>
    <property>
        <name>dfs.replication</name>
        <value>1</value>
    </property>
</configuration>
EOF

Assure ssh to localhost is working

# Ensure passwordless SSH to localhost works (required by start-dfs.sh /
# start-yarn.sh, which ssh to each worker).
# BUG FIX: ssh-keygen's -N flag takes the passphrase as its argument; the
# original omitted it, so "-f" was consumed as the passphrase and the key
# was never written where expected. Pass an explicit empty passphrase.
mkdir -p ~/.ssh && chmod 700 ~/.ssh
[ ! -f ~/.ssh/id_rsa ] && ssh-keygen -t rsa -N "" -f ~/.ssh/id_rsa
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
chmod 600 ~/.ssh/authorized_keys
  1. Format the filesystem: hdfs namenode -format

  2. Start NameNode and DataNode daemon: ${HADOOP_HOME}/sbin/start-dfs.sh

    https://stackoverflow.com/questions/48129029/hdfs-namenode-user-hdfs-datanode-user-hdfs-secondarynamenode-user-not-defined

     export HDFS_NAMENODE_USER="root"
     export HDFS_DATANODE_USER="root"
     export HDFS_SECONDARYNAMENODE_USER="root"
     export YARN_RESOURCEMANAGER_USER="root"
     export YARN_NODEMANAGER_USER="root"
    
  3. Browse the web interface for the NameNode; by default available at: NameNode - http://localhost:9870

  4. Make the HDFS directories required to execute MapReduce jobs:

# Per-user home directory inside HDFS (replace <username> accordingly).
hdfs dfs -mkdir /user
hdfs dfs -mkdir /user/<username>
  5. Copy a file into HDFS:
# BUG FIX: `hdfs -put` is not a valid invocation — the file-system
# subcommand `dfs` is required.
hdfs dfs -put <file> /user/

Start yarn

mapred-site.xml

# Write mapred-site.xml: run MapReduce on YARN and tell the MR runtime
# where Hadoop lives.
# BUG FIX: the original declared yarn.app.mapreduce.am.env,
# mapreduce.map.env and mapreduce.reduce.env twice each; with duplicate
# names only the LAST occurrence is effective, so the dead first set
# (HADOOP_HOME=...) is dropped and only the effective values are kept.
# Note: \${HADOOP_HOME} is escaped so the literal placeholder lands in the
# XML, while the unescaped ${HADOOP_HOME} occurrences expand at write time.
cat > ${HADOOP_HOME}/etc/hadoop/mapred-site.xml <<EOF
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl"
href="configuration.xsl"?>
<!-- Put site-specific property overrides in this file. -->
<configuration>
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>
    <property>
      <name>yarn.app.mapreduce.am.env</name>
      <value>HADOOP_MAPRED_HOME=\${HADOOP_HOME}</value>
    </property>
    <property>
      <name>mapreduce.map.env</name>
      <value>HADOOP_MAPRED_HOME=\${HADOOP_HOME}</value>
    </property>
    <property>
      <name>mapreduce.reduce.env</name>
      <value>HADOOP_MAPRED_HOME=\${HADOOP_HOME}</value>
    </property>
    <property>
    <name>yarn.application.classpath</name>
    <value>
        ${HADOOP_HOME}/etc/*,
        ${HADOOP_HOME}/etc/hadoop/*,
        ${HADOOP_HOME}/lib/*,
        ${HADOOP_HOME}/share/hadoop/common/*,
        ${HADOOP_HOME}/share/hadoop/common/lib/*,
        ${HADOOP_HOME}/share/hadoop/mapreduce/*,
        ${HADOOP_HOME}/share/hadoop/mapreduce/lib-examples/*,
        ${HADOOP_HOME}/share/hadoop/hdfs/*,
        ${HADOOP_HOME}/share/hadoop/hdfs/lib/*,
        ${HADOOP_HOME}/share/hadoop/yarn/*,
        ${HADOOP_HOME}/share/hadoop/yarn/lib/*
    </value>
</property>
</configuration>
EOF

yarn-site.xml

# Write yarn-site.xml. Fixes vs. the original:
#  * yarn.nodemanager.env-whitelist must be one comma-separated line —
#    the value had been wrapped mid-identifier (HADOOP_ / HDFS_HOME),
#    corrupting the whitelisted variable names.
#  * "yarn.nodemanager.maximum-allocation-mb" is not a YARN property;
#    the per-container cap is yarn.scheduler.maximum-allocation-mb.
cat > ${HADOOP_HOME}/etc/hadoop/yarn-site.xml <<EOF
<configuration>
    <!-- Site specific YARN configuration properties -->
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
    <property>
        <name>yarn.nodemanager.env-whitelist</name>
        <value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME</value>
    </property>
    <property>
        <name>yarn.nodemanager.resource.memory-mb</name>
        <value>8000</value>
    </property>
    <property>
        <name>yarn.scheduler.maximum-allocation-mb</name>
        <value>8000</value>
    </property>
    <property>
        <name>yarn.nodemanager.resource.cpu-vcores</name>
        <value>8</value>
    </property>
</configuration>
EOF
# Run the YARN daemons as root (sandbox only), then start them.
export YARN_RESOURCEMANAGER_USER=root
export YARN_NODEMANAGER_USER=root
/root/hadoop-3.3.2/sbin/start-yarn.sh

caveat

Caused by: javax.jdo.JDOFatalUserException: Class "org.apache.hadoop.hive.metastore.model.MVersionTable" field "org.apache.hadoop.hive.metastore.model.MVersionTable.schemaVersionV2" : declared in MetaData, but this field doesnt exist in the class! Reason: this was caused by using Cloudera's Java. Solution: use java-1.8.0.

caveat

Error: Error while processing statement: FAILED: Execution Error, return code 2 from org.apache.hadoop.hive.ql.exec.mr.MapRedTask (state=08S01,code=2)

https://community.cloudera.com/t5/Support-Questions/Error-loading-csv-file-using-hive/td-p/205439

maybe “If you are using Hortonworks distribution then its better to use ORC format as it is optimized for TEZ execution engine and if you are considering about file size then ORC is more compressed than Parquet. “

Best Practices of ORC in HCC :-

https://community.hortonworks.com/articles/75501/orc-creation-best-practices.html

Pros and Cons of Parquet format:-

https://stackoverflow.com/questions/36822224/what-are-the-pros-and-cons-of-parquet-format-compared-t…

Orc vs Parquet:-

https://community.hortonworks.com/questions/2067/orc-vs-parquet-when-to-use-one-over-the-other.html

utils

# Start / stop every HDFS + YARN daemon on this node in one shot.
${HADOOP_HOME}/sbin/start-all.sh
${HADOOP_HOME}/sbin/stop-all.sh

check services jps

[root@hivebench ~]# jps
30535 ResourceManager
29963 DataNode
30235 SecondaryNameNode
31052 Jps
29805 NameNode
30687 NodeManager

configure hive

vim ${HIVE_HOME}/conf/hive-site.xml

# Write hive-site.xml.
#  * hive.server2.enable.doAs=FALSE avoids "User: root is not allowed to
#    impersonate anonymous" when connecting via beeline.
#  * system:java.io.tmpdir / system:user.name work around the
#    "Relative path in absolute URI" startup failure (see caveats below).
#    \${user.name} is escaped so the placeholder reaches the XML verbatim.
cat > ${HIVE_HOME}/conf/hive-site.xml <<EOF
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<configuration>
    <property>
        <name>hive.server2.enable.doAs</name>
        <value>FALSE</value>
        <description>
        Setting this property to true will have HiveServer2 execute
        Hive operations as the user making the calls to it.
        </description>
    </property> 
    <property>
      <name>system:java.io.tmpdir</name>
      <value>/tmp/hive/java</value>
    </property>
    <property>
      <name>system:user.name</name>
      <value>\${user.name}</value>
    </property>
</configuration>
EOF

https://george-jen.gitbook.io/data-science-and-apache-spark/hive-home

# Create the HDFS scratch and warehouse directories Hive requires.
# ROBUSTNESS: -p creates missing parents and succeeds if the directory
# already exists, so this block is safe to re-run (plain `-mkdir /tmp`
# fails when /tmp is already present).
hdfs dfs -mkdir -p /tmp
hdfs dfs -mkdir -p /user/hive/warehouse
hdfs dfs -chmod g+w /user/hive/warehouse
hdfs dfs -chmod g+w /tmp

initialize hive schema

# Initialize the embedded Derby metastore schema.  Run from HIVE_HOME so
# the metastore_db directory is created there (Derby uses the cwd).
pushd ${HIVE_HOME}
schematool -dbType derby -initSchema -verbose
popd

Small caveat: start HiveServer2 (HS2) before the Hive Metastore (HMS).

# HiveServer2 in the foreground (useful for first-run debugging):
hive --service hiveserver2 --hiveconf hive.server2.thrift.port=10000 --hiveconf hive.root.logger=INFO,console
# ...or detached in a screen session, logging to /tmp/hive.log:
screen -L -Logfile /tmp/hive.log -dmS hive bash -c "cd ${HIVE_HOME}; hive --service hiveserver2 --hiveconf hive.server2.thrift.port=10000 --hiveconf hive.root.logger=INFO,console"       
# Metastore in the foreground:
hive --service metastore --hiveconf hive.root.logger=INFO,console
# ...or detached, logging to /tmp/hms.log:
screen -L -Logfile /tmp/hms.log -dmS hms bash -c "cd ${HIVE_HOME}; hive --service metastore --hiveconf hive.root.logger=INFO,console" 
# Beeline connection helper: run `$HIVE -e "..."`.
export HIVE="beeline -u 'jdbc:hive2://localhost:10000'  "

caveats

caveat

User: root is not allowed to impersonate anonymous

https://stackoverflow.com/questions/43180305/cannot-connect-to-hive-using-beeline-user-root-cannot-impersonate-anonymous

hive-site.xml

<property>
    <name>hive.server2.enable.doAs</name>
    <value>FALSE</value>
    <description>
    Setting this property to true will have HiveServer2 execute
    Hive operations as the user making the calls to it.
    </description>
</property> 

caveat

Exception in thread “main” java.lang.IllegalArgumentException: java.net.URISyntaxException: Relative path in absolute URI: ${system:java.io.tmpdir%7D/$%7Bsystem:user.name%7D

https://stackoverflow.com/questions/27099898/java-net-urisyntaxexception-when-starting-hive add to hive-site.xml

  <property>
    <name>system:java.io.tmpdir</name>
    <value>/tmp/hive/java</value>
  </property>
  <property>
    <name>system:user.name</name>
    <value>${user.name}</value>
  </property>
HIVE="beeline -u 'jdbc:hive2://localhost:10000'  "

install tez

# Download Tez and put its jars on the Hadoop classpath.
curl -O https://dlcdn.apache.org/tez/0.9.2/apache-tez-0.9.2-bin.tar.gz
tar xzvf apache-tez-0.9.2-bin.tar.gz
export HADOOP_CLASSPATH="$HADOOP_CLASSPATH:/root/apache-tez-0.9.2-bin/lib/*"
export HADOOP_CLASSPATH="$HADOOP_CLASSPATH:/root/apache-tez-0.9.2-bin/*"
export TEZ_HOME=/root/apache-tez-0.9.2-bin

copy tez.tar.gz

# Upload the Tez runtime archive to HDFS where tez.lib.uris will point.
# CLEANUP: the original used two pushd/popd pairs; one suffices, and
# -mkdir -p makes the step idempotent on re-runs.
pushd "$TEZ_HOME"
hadoop fs -mkdir -p /apps/
hadoop fs -copyFromLocal share/tez.tar.gz /apps/
popd

add to tez-site.xml

# Write tez-site.xml: tez.lib.uris must point at the tez.tar.gz that was
# uploaded to HDFS in the previous step.
cat > ${TEZ_HOME}/conf/tez-site.xml <<EOF
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<!-- WARNING: THIS IS A GENERATED TEMPLATE PURELY FOR DOCUMENTATION PURPOSES
 AND SHOULD NOT BE USED AS A CONFIGURATION FILE FOR TEZ -->

<configuration>


  <property>
    <name>tez.lib.uris</name>
    <value>/apps/tez.tar.gz</value>
    <description>String value to a file path.
 The location of the Tez libraries which will be localized for DAGs.
 This follows the following semantics
    </description>
    <type>string</type>
  </property>


</configuration>
EOF

caveats

org.apache.hadoop.yarn.exceptions.InvalidAuxServiceException: The auxService:mapreduce_shuffle does not exist

https://stackoverflow.com/questions/33345262/org-apache-hadoop-yarn-exceptions-invalidauxserviceexception-the-auxservicemap

yarn-site.xml

<configuration>
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
</configuration>

hive-bench

# Restart the whole stack, clear previously generated data, then generate
# TPC-DS data at scale factor 50 (~50 GB raw).
# NOTE(review): stop-all.sh is run twice — presumably to catch daemons
# that survive the first pass; confirm before removing the repeat.
/root/hadoop-3.3.2/sbin/stop-all.sh
sleep 2
/root/hadoop-3.3.2/sbin/stop-all.sh
sleep 2
/root/hadoop-3.3.2/sbin/start-all.sh
sleep 5
hadoop fs -rm -R /tmp/tpcds-generate
./tpcds-setup.sh 50

On CDP Private

.vimrc

# Append vim defaults to ~/.vimrc.
# BUG FIX: the original used a single '<' (redirect stdin from a file
# literally named "EOF"); a here-doc requires '<<'.  The delimiter is
# quoted so nothing in the body is ever expanded by the shell.
cat >> ~/.vimrc <<'EOF'
set tabstop=4     " display tab as 4-space wide
set shiftwidth=4  " identation width when using << and >>
set expandtab
EOF

install tpc-ds

# Build the hive-testbench TPC-DS data generator (needs gcc & friends).
yum -y groupinstall "Development tools"
git clone https://github.com/hortonworks/hive-testbench.git
cd hive-testbench
# yum -y install java-11-openjdk java-11-openjdk-devel
# cloudera openjdk8 already installed
./tpcds-build.sh