# Cloudera Vagrantfile
$anaconda_deps = <<SCRIPT
ANACONDA_INSTALLER=https://3230d63b5fc54e62148e-c95ac804525aac4b6dba79b00b39d1d3.ssl.cf1.rackcdn.com/Anaconda-2.3.0-Linux-x86_64.sh
if [ ! -d "/usr/local/anaconda" ]; then
echo "Anaconda installation..." \
&& wget ${ANACONDA_INSTALLER} -q -P /tmp/ \
&& bash /tmp/Anaconda-2.3.0-Linux-x86_64.sh -b -f -p /usr/local/anaconda
fi
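# A hedged optional step (an assumption, not part of the original gist):
# expose Anaconda on the PATH system-wide by uncommenting the next line.
# echo 'export PATH=/usr/local/anaconda/bin:$PATH' > /etc/profile.d/anaconda.sh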
SCRIPT
$mysql_deps = <<SCRIPT
MYSQL_REPO=https://dev.mysql.com/get/mysql-community-release-el6-5.noarch.rpm
MY_CNF=/etc/my.cnf
DEV_PASSWORD=hadoop
[ ! -e /etc/yum.repos.d/mysql-community.repo ] && rpm -ivh ${MYSQL_REPO}
yum install -y mysql-community-server
if [ -e /etc/init.d/mysqld ] && ! grep -q vagrant ${MY_CNF}; then
  # Append InnoDB tuning for a dev-sized instance
  cat >> ${MY_CNF} <<MYCNF
# InnoDB settings
innodb_file_per_table = 1
innodb_flush_log_at_trx_commit = 2
innodb_log_buffer_size = 64M
innodb_buffer_pool_size = 1G
innodb_thread_concurrency = 8
innodb_flush_method = O_DIRECT
innodb_log_file_size = 512M
explicit_defaults_for_timestamp = 1
MYCNF
  chkconfig mysqld on \
    && service mysqld start \
    && /usr/bin/mysqladmin -u root password "${DEV_PASSWORD}" \
    && echo "# vagrant provisioned" >> ${MY_CNF}
fi
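# A hedged follow-up (an assumption, not in the original gist): once mysqld
# is running you could pre-create the Hive metastore database, e.g.
# mysql -u root -p"${DEV_PASSWORD}" -e "CREATE DATABASE IF NOT EXISTS metastore"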
SCRIPT
$spark_deps = <<SCRIPT
SPARK_TGZ=spark-1.5.1-bin-without-hadoop.tgz
SPARK_LINK=/opt/spark
[ ! -e ${SPARK_LINK} ] \
  && echo "Spark installation..." \
  && wget http://ftp.heanet.ie/mirrors/www.apache.org/dist/spark/spark-1.5.1/${SPARK_TGZ} -q -P /opt/ \
  && tar zxf /opt/${SPARK_TGZ} -C /opt/ \
  && ln -s /opt/spark-1.5.1-bin-without-hadoop ${SPARK_LINK}
# Bail out if the download failed
[ ! -e /opt/${SPARK_TGZ} ] && exit 1
# A login-shell PATH entry belongs in /etc/profile.d, not /etc/init.d
echo "export PATH=\$PATH:${SPARK_LINK}/bin" > /etc/profile.d/spark.sh
echo "HADOOP_CONF_DIR=/etc/hadoop/conf/" > ${SPARK_LINK}/conf/spark-env.sh
# $(hadoop classpath) expands at provision time, so the CDH packages must
# already be installed when this script runs
echo "SPARK_DIST_CLASSPATH=$(hadoop classpath)" >> ${SPARK_LINK}/conf/spark-env.sh
echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/opt/cloudera/parcels/CDH/lib/hadoop/lib/native/" >> ${SPARK_LINK}/conf/spark-env.sh
cat << SPCNF > ${SPARK_LINK}/conf/spark-defaults.conf
# the assembly version should match the installed Spark (1.5.1) and the jar
# must be uploaded to HDFS beforehand
spark.yarn.jar hdfs:///user/spark/share/lib/spark-assembly-1.5.1-hadoop2.6.0.jar
spark.shuffle.service.enabled true
# Execution Behavior
spark.broadcast.blockSize 4096
# Dynamic Resource Allocation (YARN)
spark.dynamicAllocation.enabled true
spark.speculation true
spark.scheduler.mode FAIR
spark.kryoserializer.buffer.max 1000m
spark.driver.maxResultSize 0
spark.serializer org.apache.spark.serializer.KryoSerializer
spark.yarn.preserve.staging.files false
spark.master yarn
spark.rdd.compress true
# Local execution of selected Spark functions
spark.localExecution.enabled true
spark.sql.parquet.binaryAsString true
spark.sql.parquet.compression.codec snappy
# use lz4 compression for broadcast variables as Snappy is not supported on MacOSX
spark.broadcast.compress true
spark.io.compression.codec lz4
spark.driver.extraLibraryPath /opt/cloudera/parcels/CDH/lib/hadoop/lib/native
spark.executor.extraLibraryPath /opt/cloudera/parcels/CDH/lib/hadoop/lib/native
spark.executor.extraJavaOptions -verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+UseCompressedOops
spark.driver.extraJavaOptions -XX:+UseCompressedOops -XX:MaxPermSize=1g
spark.executor.extraClassPath /usr/local/lib/jdbc/sqlserver/*.jar:/usr/local/lib/jdbc/mysql/*.jar:/usr/local/anaconda/bin:/opt/udfs/hive/*.jar
spark.driver.extraClassPath /usr/local/lib/jdbc/sqlserver/*.jar:/usr/local/lib/jdbc/mysql/*.jar:/usr/local/anaconda/bin:/opt/udfs/hive/*.jar
SPCNF
echo "Add hive-site.xml configuration here !!!"
SCRIPT
$cloudera_deps = <<SCRIPT
CLOUDERA_REPO=http://archive.cloudera.com/cdh5/redhat/6/x86_64/cdh/cloudera-cdh5.repo
# Add Cloudera repository
[ ! -e /etc/yum.repos.d/cloudera-cdh5.repo ] \
&& wget ${CLOUDERA_REPO} -q -P /etc/yum.repos.d/
# Cloudera Hadoop installation
yum install -y java-1.7.0-openjdk java-1.7.0-openjdk-devel \
  hadoop hadoop-conf-pseudo hadoop-hdfs-datanode hadoop-hdfs-journalnode \
  hadoop-hdfs-namenode hadoop-hdfs-secondarynamenode hadoop-hdfs-zkfc \
  hadoop-libhdfs-devel hadoop-mapreduce-historyserver hadoop-yarn-nodemanager \
  hadoop-yarn-resourcemanager zookeeper zookeeper-native zookeeper-server \
  oozie oozie-client kite sqoop hive hive-metastore hive-server2 \
  hive-hcatalog hive-jdbc avro-libs pig impala*
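# A hedged post-install sketch (an assumption, not in the original gist): a
# fresh pseudo-distributed node still needs HDFS formatted and the core
# services started, roughly:
# sudo -u hdfs hdfs namenode -format
# for svc in hadoop-hdfs-namenode hadoop-hdfs-datanode \
#            hadoop-yarn-resourcemanager hadoop-yarn-nodemanager; do
#   service ${svc} start
# done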
SCRIPT
$system_config = <<SCRIPT
DEV_USER=hadoop_oozie
DEV_PASSWORD=hadoop
PROXY_CONFIG=/etc/profile.d/proxy.sh
service iptables stop && chkconfig iptables off
if grep -q ryanair /etc/resolv.conf; then
  echo "export http_proxy=http://internalproxy.corp.ryanair.com:3128" > ${PROXY_CONFIG} \
    && echo "export https_proxy=http://internalproxy.corp.ryanair.com:3128" >> ${PROXY_CONFIG}
else
  rm -f ${PROXY_CONFIG}
fi
# Add a dev user - don't worry about the password
if ! grep -q ${DEV_USER} /etc/passwd; then
  echo "Creating user ${DEV_USER}" \
    && useradd -p $(openssl passwd -1 ${DEV_PASSWORD}) ${DEV_USER} \
    && echo "${DEV_USER} ALL=(ALL) NOPASSWD: ALL" > /etc/sudoers.d/hadoop_oozie
fi
SCRIPT
Vagrant.configure(2) do |config|
  config.vm.box = "boxcutter/centos66"
  config.vm.hostname = "cdh.instance.com"
  config.vm.network :public_network, :bridge => "en3: Thunderbolt Ethernet", :mac => "0800DEADBEEF"

  config.vm.provider "virtualbox" do |vb|
    vb.name = "cloudera-hadoop"
    vb.cpus = 4
    vb.memory = 8192
    vb.customize ["modifyvm", :id, "--nicpromisc2", "allow-all"]
    vb.customize ["modifyvm", :id, "--cpuexecutioncap", "100"]
  end

  config.vm.provision :shell, :name => "system_config", :inline => $system_config
  config.vm.provision :shell, :name => "anaconda_deps", :inline => $anaconda_deps
  config.vm.provision :shell, :name => "mysql_deps", :inline => $mysql_deps
  # cloudera_deps must run before spark_deps: spark-env.sh captures
  # $(hadoop classpath) at provision time, which requires the CDH packages
  config.vm.provision :shell, :name => "cloudera_deps", :inline => $cloudera_deps
  config.vm.provision :shell, :name => "spark_deps", :inline => $spark_deps
end
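# Typical usage (assuming Vagrant and VirtualBox are installed on the host):
#   vagrant up          # boot and provision the VM
#   vagrant ssh         # log in; try e.g. `spark-shell --master yarn`
#   vagrant provision   # re-run the provisioning scripts on a running VM
#   vagrant destroy -f  # throw the instance away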