# Cloudera Vagrantfile
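#
# Single-node CDH 5 development VM: pseudo-distributed Hadoop, Hive with a
# MySQL-backed metastore, Spark 1.5.1 on YARN, and Anaconda Python, provisioned
# by the shell scripts below.
#
# Usage (assumes Vagrant and VirtualBox are installed on the host):
#   vagrant up    # create and provision the VM
#   vagrant ssh   # log into the guest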
# Anaconda dependencies
$anaconda_deps = <<SCRIPT
ANACONDA_INSTALLER=https://3230d63b5fc54e62148e-c95ac804525aac4b6dba79b00b39d1d3.ssl.cf1.rackcdn.com/Anaconda-2.3.0-Linux-x86_64.sh
if [ ! -d "/usr/local/anaconda" ]; then
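# installer flags: -b = batch mode (no prompts), -f = don't fail if the prefix exists, -p = install prefix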
echo "Anaconda installation..." \
&& echo "downloading binaries" \
&& wget ${ANACONDA_INSTALLER} -q -P /tmp/ \
&& echo "running installer" \
&& bash /tmp/Anaconda-2.3.0-Linux-x86_64.sh -b -f -p /usr/local/anaconda
fi
SCRIPT
# MySQL dependencies
$mysql_deps = <<SCRIPT
MYSQL_REPO=https://dev.mysql.com/get/mysql-community-release-el6-5.noarch.rpm
MY_CNF=/etc/my.cnf
DEV_PASSWORD=hadoop
[ ! -e /etc/yum.repos.d/mysql-community.repo ] && rpm -ivh ${MYSQL_REPO}
yum install -y mysql-community-server
if [ -e /etc/init.d/mysqld ] && [ -z "$(grep -R vagrant ${MY_CNF})" ]; then
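# applied only once: the "# vagrant provisioned" marker appended below keeps re-provisioning from duplicating these settings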
echo "# InnoDB settings" >> ${MY_CNF}
echo "default_storage_engine = innodb" >> ${MY_CNF}
echo "innodb_file_per_table = 1" >> ${MY_CNF}
echo "innodb_flush_log_at_trx_commit = 2" >> ${MY_CNF}
echo "innodb_log_buffer_size = 64M" >> ${MY_CNF}
echo "innodb_buffer_pool_size = 1G" >> ${MY_CNF}
echo "innodb_thread_concurrency = 8" >> ${MY_CNF}
echo "innodb_flush_method = O_DIRECT" >> ${MY_CNF}
echo "innodb_log_file_size = 512M" >> ${MY_CNF}
echo "explicit_defaults_for_timestamp = 1" >> ${MY_CNF}
chkconfig mysqld on \
&& service mysqld start \
&& /usr/bin/mysqladmin -u root password "${DEV_PASSWORD}" &> /dev/null \
&& echo "# vagrant provisioned" >> ${MY_CNF}
mysql -u root -p${DEV_PASSWORD} \
-e "create schema if not exists hive; grant all on hive.* to 'hive'@'localhost' identified by 'hive'"
fi
SCRIPT
# Spark dependencies
$spark_deps = <<SCRIPT
SPARK_TGZ=spark-1.5.1-bin-without-hadoop.tgz
SPARK_LINK=/opt/spark
[ ! -e ${SPARK_LINK} ] \
&& echo "Spark installation..." \
&& echo "downloading binaries" \
&& wget http://ftp.heanet.ie/mirrors/www.apache.org/dist/spark/spark-1.5.1/${SPARK_TGZ} -q -P /opt/ \
&& tar zxf /opt/${SPARK_TGZ} -C /opt/ \
&& ln -s /opt/spark-1.5.1-bin-without-hadoop ${SPARK_LINK}
[ ! -e /opt/${SPARK_TGZ} ] && exit 1
echo "Spark configuration..."
echo "configuring /etc/profile.d/spark.sh"
echo 'export PATH=$PATH'":${SPARK_LINK}/bin" > /etc/profile.d/spark.sh
echo "configuring /opt/spark/conf/spark-env.sh"
cat << SPCNF > /opt/spark/conf/spark-env.sh
HADOOP_CONF_DIR=/etc/hadoop/conf/
SPARK_DIST_CLASSPATH=\\$(hadoop classpath)
LD_LIBRARY_PATH=\\${LD_LIBRARY_PATH}:/opt/cloudera/parcels/CDH/lib/hadoop/lib/native/
SPCNF
echo "configuring ${SPARK_LINK}/conf/spark-defaults.conf"
cat << SPCNF > ${SPARK_LINK}/conf/spark-defaults.conf
spark.shuffle.service.enabled true
# Execution Behavior
spark.broadcast.blockSize 4096
# Dynamic Resource Allocation (YARN)
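# dynamic allocation on YARN requires the external shuffle service (enabled above and registered as a NodeManager aux-service in yarn-site.xml)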
spark.dynamicAllocation.enabled true
spark.speculation true
spark.scheduler.mode FAIR
spark.kryoserializer.buffer.max 1000m
spark.driver.maxResultSize 0
spark.serializer org.apache.spark.serializer.KryoSerializer
spark.yarn.preserve.staging.files false
spark.master yarn
spark.rdd.compress true
# Local execution of selected Spark functions
spark.localExecution.enabled true
spark.sql.parquet.binaryAsString true
spark.sql.parquet.compression.codec snappy
# use lz4 compression for broadcast variables as Snappy is not supported on Mac OS X
spark.broadcast.compress true
spark.io.compression.codec lz4
spark.driver.extraLibraryPath /opt/cloudera/parcels/CDH/lib/hadoop/lib/native
spark.executor.extraLibraryPath /opt/cloudera/parcels/CDH/lib/hadoop/lib/native
spark.executor.extraJavaOptions -verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+UseCompressedOops
spark.driver.extraJavaOptions -XX:+UseCompressedOops -XX:MaxPermSize=1g
spark.executor.extraClassPath /usr/local/lib/jdbc/sqlserver/*.jar:/usr/local/lib/jdbc/mysql/*.jar:/usr/local/anaconda/bin:/opt/udfs/hive/*.jar
spark.driver.extraClassPath /usr/local/lib/jdbc/sqlserver/*.jar:/usr/local/lib/jdbc/mysql/*.jar:/usr/local/anaconda/bin:/opt/udfs/hive/*.jar
SPCNF
echo "configuring ${SPARK_LINK}/conf/hive-site.xml"
cat << HIVECNF > ${SPARK_LINK}/conf/hive-site.xml
<configuration>
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:mysql://localhost:3306/hive?createDatabaseIfNotExist=true</value>
</property>
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>hive</value>
</property>
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>hive</value>
</property>
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>com.mysql.jdbc.Driver</value>
</property>
<property>
<name>hive.metastore.uris</name>
<value>thrift://cdh.instance.com:9083</value>
</property>
<property>
<name>hive.metastore.warehouse.dir</name>
<value>hdfs:///user/hive/warehouse</value>
</property>
</configuration>
HIVECNF
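# the NodeManager loads org.apache.spark.network.yarn.YarnShuffleService from this jar
# (registered as the spark_shuffle aux-service in yarn-site.xml further down)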
echo "installing resource scheduler" \
&& mkdir -p /usr/lib/hadoop-yarn/lib/ \
&& cp -f ${SPARK_LINK}/lib/spark-*-yarn-shuffle.jar /usr/lib/hadoop-yarn/lib/
SCRIPT
# Cloudera CDH dependencies
$cloudera_deps = <<SCRIPT
CLOUDERA_REPO=http://archive.cloudera.com/cdh5/redhat/6/x86_64/cdh/cloudera-cdh5.repo
# Add Cloudera repository
[ ! -e /etc/yum.repos.d/cloudera-cdh5.repo ] \
&& wget ${CLOUDERA_REPO} -q -P /etc/yum.repos.d/
# Cloudera Hadoop installation
yum install -y java-1.7.0-openjdk java-1.7.0-openjdk-devel hadoop \
hadoop-conf-pseudo hadoop-hdfs-datanode hadoop-hdfs-journalnode \
hadoop-hdfs-namenode hadoop-hdfs-secondarynamenode hadoop-hdfs-zkfc \
hadoop-libhdfs-devel hadoop-mapreduce-historyserver hadoop-yarn-nodemanager \
hadoop-yarn-resourcemanager zookeeper zookeeper-native zookeeper-server \
oozie oozie-client kite sqoop hive hive-metastore hive-server2 hive-hcatalog \
hive-jdbc avro-libs pig impala*
cat << HDPCNF > /etc/hadoop/conf/mapred-site.xml
<configuration>
<property>
<name>mapred.job.tracker</name>
<value>localhost:8021</value>
</property>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
<property>
<name>mapreduce.jobhistory.address</name>
<value>localhost:10020</value>
</property>
<property>
<name>mapreduce.jobhistory.webapp.address</name>
<value>localhost:19888</value>
</property>
<property>
<description>To set the value of tmp directory for map and reduce tasks.</description>
<name>mapreduce.task.tmp.dir</name>
<value>/var/lib/hadoop-mapreduce/cache/\\${user.name}/tasks</value>
</property>
<property>
<name>mapreduce.map.memory.mb</name>
<value>512</value>
</property>
<property>
<name>mapreduce.reduce.memory.mb</name>
<value>512</value>
</property>
</configuration>
HDPCNF
cat << YRNCNF > /etc/hadoop/conf/yarn-site.xml
<configuration>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle,spark_shuffle</value>
</property>
<property>
<name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
<value>org.apache.hadoop.mapred.ShuffleHandler</value>
</property>
<property>
<name>yarn.nodemanager.aux-services.spark_shuffle.class</name>
<value>org.apache.spark.network.yarn.YarnShuffleService</value>
</property>
<property>
<name>yarn.log-aggregation-enable</name>
<value>true</value>
</property>
<property>
<name>yarn.dispatcher.exit-on-error</name>
<value>true</value>
</property>
<property>
<description>List of directories to store localized files in.</description>
<name>yarn.nodemanager.local-dirs</name>
<value>/var/lib/hadoop-yarn/cache/\\${user.name}/nm-local-dir</value>
</property>
<property>
<description>Where to store container logs.</description>
<name>yarn.nodemanager.log-dirs</name>
<value>/var/log/hadoop-yarn/containers</value>
</property>
<property>
<description>Where to aggregate logs to.</description>
<name>yarn.nodemanager.remote-app-log-dir</name>
<value>/var/log/hadoop-yarn/apps</value>
</property>
<property>
<description>Classpath for typical applications.</description>
<name>yarn.application.classpath</name>
<value>\\$HADOOP_CONF_DIR,\\$HADOOP_COMMON_HOME/*,\\$HADOOP_COMMON_HOME/lib/*,\\$HADOOP_HDFS_HOME/*,
\\$HADOOP_HDFS_HOME/lib/*,\\$HADOOP_MAPRED_HOME/*,\\$HADOOP_MAPRED_HOME/lib/*,\\$HADOOP_YARN_HOME/*,
\\$HADOOP_YARN_HOME/lib/*
</value>
</property>
</configuration>
YRNCNF
# format namenode
if [ ! -e /var/lib/hadoop-hdfs/cache/hdfs ]; then
echo "Formatting HDFS..." \
&& sudo -u hdfs hdfs namenode -format -force &> /dev/null
fi
MYSQL_JDBC=mysql-connector-java-5.1.37
MYSQL_JDBC_SOURCE=http://dev.mysql.com/get/Downloads/Connector-J/${MYSQL_JDBC}.tar.gz
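# the connector jar is required by the Hive metastore, whose javax.jdo.option.ConnectionDriverName is com.mysql.jdbc.Driver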
mkdir -p /usr/local/lib/jdbc/mysql \
&& echo "Downloading MySQL JDBC drivers" \
&& wget ${MYSQL_JDBC_SOURCE} -q -P /tmp/ \
&& echo "Installing MySQL JDBC drivers" \
&& tar zxf /tmp/${MYSQL_JDBC}.tar.gz -C /tmp/ \
&& cp /tmp/${MYSQL_JDBC}/mysql-connector-java*.jar /usr/lib/hive/lib/ \
&& cp /tmp/${MYSQL_JDBC}/mysql-connector-java*.jar /usr/local/lib/jdbc/mysql/
cat << HIVECNF > /etc/hive/conf/hive-site.xml
<configuration>
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:mysql://localhost:3306/hive?createDatabaseIfNotExist=true</value>
</property>
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>hive</value>
</property>
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>hive</value>
</property>
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>com.mysql.jdbc.Driver</value>
</property>
<property>
<name>hive.metastore.uris</name>
<value>thrift://cdh.instance.com:9083</value>
</property>
<property>
<name>hive.metastore.warehouse.dir</name>
<value>hdfs:///user/hive/warehouse</value>
</property>
</configuration>
HIVECNF
# auto-start services
chkconfig hadoop-hdfs-namenode on \
&& chkconfig hadoop-hdfs-datanode on \
&& chkconfig hadoop-yarn-resourcemanager on \
&& chkconfig hadoop-yarn-nodemanager on \
&& chkconfig hive-metastore on \
&& chkconfig hive-server2 on \
&& chkconfig oozie on
# start hadoop processes
if [ ! "$(ps aux | grep hdfs-namenode | wc -l)" == "2" ]; then
service hadoop-hdfs-namenode start
fi
if [ ! "$(ps aux | grep datanode | wc -l)" == "2" ]; then
service hadoop-hdfs-datanode start
fi
if [ ! "$(ps aux | grep resourcemanager | wc -l)" == "2" ]; then
service hadoop-yarn-resourcemanager start
fi
if [ ! "$(ps aux | grep nodemanager | wc -l)" == "2" ]; then
service hadoop-yarn-nodemanager start
fi
echo "Creating HDFS directory structure" \
&&sudo -u hdfs hdfs dfs -mkdir -p /user \
&& sudo -u hdfs hdfs dfs -chmod -R 777 /user \
&& sudo -u hdfs hdfs dfs -mkdir -p /user/spark \
&& sudo -u hdfs hdfs dfs -chmod -R 755 /user/spark \
&& sudo -u hdfs hdfs dfs -mkdir -p /tmp \
&& sudo -u hdfs hdfs dfs -chmod -R 777 /tmp \
&& sudo -u hdfs hdfs dfs -mkdir -p /user/hive/warehouse \
&& sudo -u hdfs hdfs dfs -chown -R hive:hive /user/hive/warehouse \
&& sudo -u hdfs hdfs dfs -chmod -R 755 /user/hive/warehouse
if [ ! "$(ps aux | grep HiveMetaStore | wc -l)" == "2" ]; then
service hive-metastore start
fi
if [ ! "$(ps aux | grep HiveServer2 | wc -l)" == "2" ]; then
service hive-server2 start
fi
SCRIPT
# OS configuration
$system_config = <<SCRIPT
DEV_USER=hadoop_oozie
DEV_PASSWORD=hadoop
PROXY_CONFIG=/etc/profile.d/proxy.sh
service iptables stop && chkconfig iptables off
if grep ryanair /etc/resolv.conf; then
echo "export http_proxy=http://internalproxy.corp.ryanair.com:3128" > ${PROXY_CONFIG} \
&& echo "export https_proxy=http://internalproxy.corp.ryanair.com:3128" >> ${PROXY_CONFIG}
else
rm -fR ${PROXY_CONFIG}
fi
# Add entries to /etc/hosts
ip=$(ifconfig eth1 | awk -v host=$(hostname) '/inet addr/ {print substr($2,6)}')
host=$(hostname)
echo "127.0.0.1 localhost" > /etc/hosts
echo "$ip $host" >> /etc/hosts
# Add a dev user - don't worry about the password
if ! grep ${DEV_USER} /etc/passwd; then
echo "Creating user ${DEV_USER}" && useradd -p $(openssl passwd -1 ${DEV_PASSWORD}) ${DEV_USER} \
&& echo "${DEV_USER} ALL=(ALL) NOPASSWD: ALL" > /etc/sudoers.d/hadoop_oozie
fi
SCRIPT
$information = <<SCRIPT
ip=$(ifconfig eth1 | awk -v host=$(hostname) '/inet addr/ {print substr($2,6)}')
echo "Guest IP address: $ip"
echo "Namenode's UI available at: http://$ip:50070"
echo "Resource Manager's UI available at: http://$ip:8088"
echo "MySQL root password: hadoop"
echo "You may want to add the below line to /etc/hosts:"
echo "$ip cdh.instance.com"
SCRIPT
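# Provisioners run in the order listed below: system_config first (proxy, /etc/hosts,
# dev user), then the package installs. The scripts expect the bridged :public_network
# interface to show up as eth1 inside the guest.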
Vagrant.configure(2) do |config|
  config.vm.box = "boxcutter/centos66"
  config.vm.hostname = "cdh.instance.com"
  config.vm.network :public_network, :mac => "0800DEADBEEF"
  config.vm.provider "virtualbox" do |vb|
    vb.name = "cloudera-hadoop"
    vb.cpus = 4
    vb.memory = 8192
    vb.customize ["modifyvm", :id, "--nicpromisc2", "allow-all"]
    vb.customize ["modifyvm", :id, "--cpuexecutioncap", "100"]
  end
  config.vm.provision :shell, :name => "system_config", :inline => $system_config
  config.vm.provision :shell, :name => "anaconda_deps", :inline => $anaconda_deps
  config.vm.provision :shell, :name => "mysql_deps", :inline => $mysql_deps
  config.vm.provision :shell, :name => "spark_deps", :inline => $spark_deps
  config.vm.provision :shell, :name => "cloudera_deps", :inline => $cloudera_deps
  config.vm.provision :shell, :name => "information", :inline => $information
end