prologue
# Base packages: JDK 8 (build + run Hadoop/Hive), screen (detached
# daemons later in these notes), htop (monitoring).
for pkg in java-1.8.0-openjdk-devel screen htop; do
  yum -y install "$pkg"
done
# --- Environment for the single-node Hadoop / Hive / Tez sandbox ----------
export JAVA_HOME="/usr/lib/jvm/java-1.8.0-openjdk-1.8.0.332.b09-1.el8_5.x86_64"
export HADOOP_HOME="/root/hadoop-3.3.2"
export HIVE_HOME="/root/apache-hive-3.1.2-bin"
export TEZ_HOME="/root/apache-tez-0.9.2-bin"

# Hive's bin ends up ahead of Hadoop's, exactly as the two separate
# prepends in the original notes produced.
export PATH="${HIVE_HOME}/bin:${HADOOP_HOME}/bin:$PATH"

# Build HADOOP_CLASSPATH from every dependency directory we need.
# The globs stay literal (assignments are not glob-expanded); the hadoop
# launcher expands them when it assembles the JVM classpath.
_cp="${HADOOP_HOME}/share/hadoop/tools/lib/*"
for _d in hdfs common client mapreduce yarn; do
  _cp="${_cp}:${HADOOP_HOME}/share/hadoop/${_d}/lib/*"
done
# tez
_cp="${_cp}:${TEZ_HOME}/lib/*:${TEZ_HOME}/*"
export HADOOP_CLASSPATH="${_cp}"
unset _cp _d

# hdfs + yarn: required when running the start scripts as root
# (see the stackoverflow link further down in these notes).
export HDFS_NAMENODE_USER="root"
export HDFS_DATANODE_USER="root"
export HDFS_SECONDARYNAMENODE_USER="root"
export YARN_RESOURCEMANAGER_USER="root"
export YARN_NODEMANAGER_USER="root"

# Shorthand for opening a beeline session against HiveServer2.
export HIVE="beeline -u 'jdbc:hive2://localhost:10000' "
.vimrc
set tabstop=4 " display tab as 4-space wide
set shiftwidth=4 " identation width when using << and >>
set expandtab
install tpc-ds
# Build the TPC-DS data generator and query suite from the Hortonworks
# hive-testbench repo (needs a C toolchain, hence the group install).
yum -y groupinstall "Development tools"
git clone https://github.com/hortonworks/hive-testbench.git
cd hive-testbench
# yum -y install java-11-openjdk java-11-openjdk-devel
# cloudera openjdk8 already installed
./tpcds-build.sh
install (hadoop + hdfs + yarn) + hive + tez
# Download and unpack Hadoop, Hive and Tez.
# -f: fail on HTTP errors instead of saving the error page as the tarball;
# -L: follow redirects (the Apache dlcdn mirrors redirect).
curl -fLO https://dlcdn.apache.org/hadoop/common/hadoop-3.3.2/hadoop-3.3.2.tar.gz
tar xzvf hadoop-3.3.2.tar.gz
curl -fLO https://dlcdn.apache.org/hive/hive-3.1.2/apache-hive-3.1.2-bin.tar.gz
tar xzvf apache-hive-3.1.2-bin.tar.gz
curl -fLO https://dlcdn.apache.org/tez/0.9.2/apache-tez-0.9.2-bin.tar.gz
tar xzvf apache-tez-0.9.2-bin.tar.gz
configure hadoop
core-site.xml
# Write core-site.xml: fs.defaultFS points all Hadoop clients at the
# local single-node HDFS NameNode (RPC endpoint localhost:9000).
# Heredoc is unquoted on purpose: the XML body contains no $-expansions.
cat > ${HADOOP_HOME}/etc/hadoop/core-site.xml <<EOF
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<!-- Put site-specific property overrides in this file. -->
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://localhost:9000</value>
</property>
</configuration>
EOF
configure hdfs
https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/SingleCluster.html
${HADOOP_HOME}/sbin/hdfs
# Write hdfs-site.xml: replication factor 1 because this is a
# single-node cluster with exactly one DataNode.
cat > ${HADOOP_HOME}/etc/hadoop/hdfs-site.xml <<EOF
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<!-- Put site-specific property overrides in this file. -->
<configuration>
<property>
<name>dfs.replication</name>
<value>1</value>
</property>
</configuration>
EOF
Make sure passwordless ssh to localhost works (start-dfs.sh/start-yarn.sh need it):
# Fixed: ssh-keygen's -N takes the passphrase as its argument, so the
# original "-N -f ~/.ssh/id_rsa" consumed "-f" as the passphrase and
# never received a key-file path. Also create ~/.ssh with the right
# permissions first (ssh-keygen does not create it).
mkdir -p ~/.ssh
chmod 700 ~/.ssh
[ ! -f ~/.ssh/id_rsa ] && ssh-keygen -t rsa -N "" -f ~/.ssh/id_rsa
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
# sshd ignores authorized_keys that are group/world writable.
chmod 600 ~/.ssh/authorized_keys
-
Format the system
hdfs namenode -format
-
Start NameNode and DataNode daemon:
${HADOOP_HOME}/sbin/start-dfs.sh
https://stackoverflow.com/questions/48129029/hdfs-namenode-user-hdfs-datanode-user-hdfs-secondarynamenode-user-not-defined
export HDFS_NAMENODE_USER="root" export HDFS_DATANODE_USER="root" export HDFS_SECONDARYNAMENODE_USER="root" export YARN_RESOURCEMANAGER_USER="root" export YARN_NODEMANAGER_USER="root"
-
Browse the web interface for the NameNode; by default available at: NameNode - http://localhost:9870
-
Make the HDFS directories required to execute MapReduce jobs:
# -p creates /user as well and does not fail if it already exists.
# "<username>" is a placeholder -- substitute the real HDFS user name
# (quoted here so the line is valid shell until it is edited).
hdfs dfs -mkdir -p "/user/<username>"
- Copy file
hdfs -put <file> /user/
Start yarn
mapred-site.xml
# Write mapred-site.xml. ${HADOOP_HOME} is expanded BY THE SHELL here, so
# the concrete /root/hadoop-3.3.2 path lands in the XML -- Hadoop's own
# ${...} interpolation resolves properties/system properties, not
# environment variables, so escaping it (as the original partly did)
# would leave an unresolvable literal in the config.
cat > ${HADOOP_HOME}/etc/hadoop/mapred-site.xml <<EOF
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Put site-specific property overrides in this file. -->
<configuration>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
<!-- Fixed: each of the three *.env properties below was declared twice;
     Hadoop keeps only the last occurrence of a duplicated property, so
     the HADOOP_HOME=... values were silently dropped. Both variables
     are now set in a single declaration per property. -->
<property>
<name>yarn.app.mapreduce.am.env</name>
<value>HADOOP_HOME=${HADOOP_HOME},HADOOP_MAPRED_HOME=${HADOOP_HOME}</value>
</property>
<property>
<name>mapreduce.map.env</name>
<value>HADOOP_HOME=${HADOOP_HOME},HADOOP_MAPRED_HOME=${HADOOP_HOME}</value>
</property>
<property>
<name>mapreduce.reduce.env</name>
<value>HADOOP_HOME=${HADOOP_HOME},HADOOP_MAPRED_HOME=${HADOOP_HOME}</value>
</property>
<property>
<name>yarn.application.classpath</name>
<value>
${HADOOP_HOME}/etc/*,
${HADOOP_HOME}/etc/hadoop/*,
${HADOOP_HOME}/lib/*,
${HADOOP_HOME}/share/hadoop/common/*,
${HADOOP_HOME}/share/hadoop/common/lib/*,
${HADOOP_HOME}/share/hadoop/mapreduce/*,
${HADOOP_HOME}/share/hadoop/mapreduce/lib-examples/*,
${HADOOP_HOME}/share/hadoop/hdfs/*,
${HADOOP_HOME}/share/hadoop/hdfs/lib/*,
${HADOOP_HOME}/share/hadoop/yarn/*,
${HADOOP_HOME}/share/hadoop/yarn/lib/*
</value>
</property>
</configuration>
EOF
yarn-site.xml
# Write yarn-site.xml for the single-node cluster (8 GB / 8 vcores
# offered by the one NodeManager).
cat > ${HADOOP_HOME}/etc/hadoop/yarn-site.xml <<EOF
<configuration>
<!-- Site specific YARN configuration properties -->
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<!-- Fixed: the whitelist is one comma-separated value; the original had
     variable names split across lines, corrupting them with embedded
     whitespace (e.g. "HADOOP_ HDFS_HOME"). -->
<property>
<name>yarn.nodemanager.env-whitelist</name>
<value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME</value>
</property>
<property>
<name>yarn.nodemanager.resource.memory-mb</name>
<value>8000</value>
</property>
<!-- Fixed: the per-container cap is a scheduler property;
     "yarn.nodemanager.maximum-allocation-mb" does not exist. -->
<property>
<name>yarn.scheduler.maximum-allocation-mb</name>
<value>8000</value>
</property>
<property>
<name>yarn.nodemanager.resource.cpu-vcores</name>
<value>8</value>
</property>
</configuration>
EOF
# start-yarn.sh refuses to launch daemons as root unless these are set.
export YARN_RESOURCEMANAGER_USER=root
export YARN_NODEMANAGER_USER=root
# Consistency: use ${HADOOP_HOME} (set at the top of these notes) instead
# of the hardcoded /root/hadoop-3.3.2 path.
${HADOOP_HOME}/sbin/start-yarn.sh
caveat
Caused by: javax.jdo.JDOFatalUserException: Class "org.apache.hadoop.hive.metastore.model.MVersionTable" field "org.apache.hadoop.hive.metastore.model.MVersionTable.schemaVersionV2" : declared in MetaData, but this field doesn't exist in the class! Reason: this happens when using Cloudera's bundled Java. Solution: use java-1.8.0 (OpenJDK).
caveat
Error: Error while processing statement: FAILED: Execution Error, return code 2 from org.apache.hadoop.hive.ql.exec.mr.MapRedTask (state=08S01,code=2)
https://community.cloudera.com/t5/Support-Questions/Error-loading-csv-file-using-hive/td-p/205439
maybe “If you are using Hortonworks distribution then its better to use ORC format as it is optimized for TEZ execution engine and if you are considering about file size then ORC is more compressed than Parquet. “
Best Practices of ORC in HCC :-
https://community.hortonworks.com/articles/75501/orc-creation-best-practices.html
Pros and Cons of Parquet format:-
https://stackoverflow.com/questions/36822224/what-are-the-pros-and-cons-of-parquet-format-compared-t…
Orc vs Parquet:-
https://community.hortonworks.com/questions/2067/orc-vs-parquet-when-to-use-one-over-the-other.html
utils
# Convenience scripts: start/stop every configured HDFS + YARN daemon
# on this node in one shot.
${HADOOP_HOME}/sbin/start-all.sh
${HADOOP_HOME}/sbin/stop-all.sh
check services jps
[root@hivebench ~]# jps
30535 ResourceManager
29963 DataNode
30235 SecondaryNameNode
31052 Jps
29805 NameNode
30687 NodeManager
configure hive
vim ${HIVE_HOME}/conf/hive-site.xml
# Write hive-site.xml.
# - hive.server2.enable.doAs=FALSE avoids the "User: root is not allowed
#   to impersonate anonymous" beeline error (see caveats further down).
# - The system:* properties fix the "Relative path in absolute URI"
#   startup failure (also in the caveats).
# - \${user.name} is escaped so the LITERAL string ${user.name} lands in
#   the XML; Hive resolves it at runtime.
cat > ${HIVE_HOME}/conf/hive-site.xml <<EOF
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>hive.server2.enable.doAs</name>
<value>FALSE</value>
<description>
Setting this property to true will have HiveServer2 execute
Hive operations as the user making the calls to it.
</description>
</property>
<property>
<name>system:java.io.tmpdir</name>
<value>/tmp/hive/java</value>
</property>
<property>
<name>system:user.name</name>
<value>\${user.name}</value>
</property>
</configuration>
EOF
https://george-jen.gitbook.io/data-science-and-apache-spark/hive-home
# Create the HDFS scratch and Hive warehouse directories.
# -p: create parents and succeed if the directory already exists
# (plain -mkdir errors out on a re-run or a pre-existing /tmp).
hdfs dfs -mkdir -p /tmp
hdfs dfs -mkdir -p /user/hive/warehouse
hdfs dfs -chmod g+w /user/hive/warehouse
hdfs dfs -chmod g+w /tmp
initialize hive schema
# Initialize the Hive metastore schema in an embedded Derby database.
# NOTE(review): Derby appears to create its metastore_db/ directory in
# the current working directory, hence the pushd into ${HIVE_HOME} --
# confirm before running schematool from anywhere else.
pushd ${HIVE_HOME}
schematool -dbType derby -initSchema -verbose
popd
small caveat start HS2 before HMS
# HiveServer2 (thrift on :10000) in the foreground, for debugging:
hive --service hiveserver2 --hiveconf hive.server2.thrift.port=10000 --hiveconf hive.root.logger=INFO,console
# Same command detached in a screen session, log captured to /tmp/hive.log:
screen -L -Logfile /tmp/hive.log -dmS hive bash -c "cd ${HIVE_HOME}; hive --service hiveserver2 --hiveconf hive.server2.thrift.port=10000 --hiveconf hive.root.logger=INFO,console"
# Hive metastore in the foreground:
hive --service metastore --hiveconf hive.root.logger=INFO,console
# Detached metastore variant, log captured to /tmp/hms.log:
screen -L -Logfile /tmp/hms.log -dmS hms bash -c "cd ${HIVE_HOME}; hive --service metastore --hiveconf hive.root.logger=INFO,console"
# Shorthand for opening a beeline session against HiveServer2.
export HIVE="beeline -u 'jdbc:hive2://localhost:10000' "
caveats
caveat
User: root is not allowed to impersonate anonymous
https://stackoverflow.com/questions/43180305/cannot-connect-to-hive-using-beeline-user-root-cannot-impersonate-anonymous
hive-site.xml
<property>
<name>hive.server2.enable.doAs</name>
<value>FALSE</value>
<description>
Setting this property to true will have HiveServer2 execute
Hive operations as the user making the calls to it.
</description>
</property>
caveat
Exception in thread "main" java.lang.IllegalArgumentException: java.net.URISyntaxException: Relative path in absolute URI: ${system:java.io.tmpdir}/${system:user.name}
https://stackoverflow.com/questions/27099898/java-net-urisyntaxexception-when-starting-hive add to hive-site.xml
<property>
<name>system:java.io.tmpdir</name>
<value>/tmp/hive/java</value>
</property>
<property>
<name>system:user.name</name>
<value>${user.name}</value>
</property>
HIVE="beeline -u 'jdbc:hive2://localhost:10000' "
install tez
# Download/unpack Tez and put it (plus its dependencies) on the Hadoop
# classpath. -f: fail on HTTP errors instead of saving the error page;
# -L: follow mirror redirects.
curl -fLO https://dlcdn.apache.org/tez/0.9.2/apache-tez-0.9.2-bin.tar.gz
tar xzvf apache-tez-0.9.2-bin.tar.gz
export HADOOP_CLASSPATH="$HADOOP_CLASSPATH:/root/apache-tez-0.9.2-bin/lib/*"
export HADOOP_CLASSPATH="$HADOOP_CLASSPATH:/root/apache-tez-0.9.2-bin/*"
export TEZ_HOME=/root/apache-tez-0.9.2-bin
copy tez.tar.gz
# Publish the Tez runtime tarball to HDFS; tez.lib.uris (tez-site.xml)
# points at it. The original wrapped the mkdir in a pointless
# pushd/popd pair; -p also makes the mkdir idempotent.
hadoop fs -mkdir -p /apps/
pushd "$TEZ_HOME"
hadoop fs -copyFromLocal share/tez.tar.gz /apps/
popd
add to tez-site.xml
# Write tez-site.xml: tez.lib.uris must point at the tez tarball that
# was uploaded to HDFS (/apps/tez.tar.gz) in the previous step.
cat > ${TEZ_HOME}/conf/tez-site.xml <<EOF
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- WARNING: THIS IS A GENERATED TEMPLATE PURELY FOR DOCUMENTATION PURPOSES
AND SHOULD NOT BE USED AS A CONFIGURATION FILE FOR TEZ -->
<configuration>
<property>
<name>tez.lib.uris</name>
<value>/apps/tez.tar.gz</value>
<description>String value to a file path.
The location of the Tez libraries which will be localized for DAGs.
This follows the following semantics
</description>
<type>string</type>
</property>
</configuration>
EOF
caveats
org.apache.hadoop.yarn.exceptions.InvalidAuxServiceException: The auxService:mapreduce_shuffle does not exist
https://stackoverflow.com/questions/33345262/org-apache-hadoop-yarn-exceptions-invalidauxserviceexception-the-auxservicemap
yarn-site.xml
<configuration>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
</configuration>
hive-bench
# Restart the whole stack, wipe any previous generated data, then run
# the TPC-DS setup at scale factor 50.
# NOTE(review): stop-all.sh was invoked twice in the original notes --
# presumably to catch daemons that survive the first pass; kept as-is.
# Consistency: use ${HADOOP_HOME} instead of the hardcoded path.
${HADOOP_HOME}/sbin/stop-all.sh
sleep 2
${HADOOP_HOME}/sbin/stop-all.sh
sleep 2
${HADOOP_HOME}/sbin/start-all.sh
sleep 5
# -f: do not fail when the directory does not exist yet (first run).
hadoop fs -rm -f -R /tmp/tpcds-generate
./tpcds-setup.sh 50
On CDP Private
.vimrc
# Fixed: the original used "<EOF" (input redirection from a file named
# "EOF"), so nothing was appended to ~/.vimrc and the settings lines
# were then executed as shell commands. "<<EOF" starts the heredoc.
cat >> ~/.vimrc <<EOF
set tabstop=4 " display tab as 4-space wide
set shiftwidth=4 " identation width when using << and >>
set expandtab
EOF
install tpc-ds
# Same TPC-DS build as earlier, repeated here for the CDP Private host.
yum -y groupinstall "Development tools"
git clone https://github.com/hortonworks/hive-testbench.git
cd hive-testbench
# yum -y install java-11-openjdk java-11-openjdk-devel
# cloudera openjdk8 already installed
./tpcds-build.sh