Cloudera Vagrantfile
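# Provisions a single-node, pseudo-distributed Cloudera CDH 5 development VM
# (CentOS 6.6) with MySQL, Anaconda 2.3.0 and Spark 1.5.1 on top of it.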
# Anaconda dependencies
$anaconda_deps = <<SCRIPT
ANACONDA_INSTALLER=https://3230d63b5fc54e62148e-c95ac804525aac4b6dba79b00b39d1d3.ssl.cf1.rackcdn.com/Anaconda-2.3.0-Linux-x86_64.sh
if [ ! -d "/usr/local/anaconda" ]; then
  echo "Anaconda installation..." \
    && echo "getting binaries" \
    && wget ${ANACONDA_INSTALLER} -q -P /tmp/ \
    && echo "running installer" \
    && bash /tmp/Anaconda-2.3.0-Linux-x86_64.sh -b -f -p /usr/local/anaconda
fi
SCRIPT
# MySQL dependencies
$mysql_deps = <<SCRIPT
MYSQL_REPO=https://dev.mysql.com/get/mysql-community-release-el6-5.noarch.rpm
MY_CNF=/etc/my.cnf
DEV_PASSWORD=hadoop
[ ! -e /etc/yum.repos.d/mysql-community.repo ] && rpm -ivh ${MYSQL_REPO}
yum install -y mysql-community-server mysql-connector-java
if [ -e /etc/init.d/mysqld ] && [ -z "$(grep vagrant ${MY_CNF})" ]; then
  echo "# InnoDB settings" >> ${MY_CNF}
  echo "default_storage_engine = innodb" >> ${MY_CNF}
  echo "innodb_file_per_table = 1" >> ${MY_CNF}
  echo "innodb_flush_log_at_trx_commit = 2" >> ${MY_CNF}
  echo "innodb_log_buffer_size = 64M" >> ${MY_CNF}
  echo "innodb_buffer_pool_size = 1G" >> ${MY_CNF}
  echo "innodb_thread_concurrency = 8" >> ${MY_CNF}
  echo "innodb_flush_method = O_DIRECT" >> ${MY_CNF}
  echo "innodb_log_file_size = 512M" >> ${MY_CNF}
  echo "explicit_defaults_for_timestamp = 1" >> ${MY_CNF}
  chkconfig mysqld on \
    && service mysqld start \
    && /usr/bin/mysqladmin -u root password "${DEV_PASSWORD}" &> /dev/null \
    && echo "# vagrant provisioned" >> ${MY_CNF}
fi
SCRIPT
# Spark dependencies
$spark_deps = <<SCRIPT
SPARK_TGZ=spark-1.5.1-bin-without-hadoop.tgz
SPARK_LINK=/opt/spark
[ ! -e ${SPARK_LINK} ] \
  && echo "Spark installation..." \
  && wget http://ftp.heanet.ie/mirrors/www.apache.org/dist/spark/spark-1.5.1/${SPARK_TGZ} -q -P /opt/ \
  && tar zxf /opt/${SPARK_TGZ} -C /opt/ \
  && ln -s /opt/spark-1.5.1-bin-without-hadoop ${SPARK_LINK}
# abort if the tarball is still missing (the download failed)
[ ! -e /opt/${SPARK_TGZ} ] && exit 1
echo "Spark configuration..."
echo "configuring /etc/profile.d/spark.sh"
echo 'export PATH=$PATH'":${SPARK_LINK}/bin" > /etc/profile.d/spark.sh
echo "configuring ${SPARK_LINK}/conf/spark-env.sh"
cat << SPCNF > ${SPARK_LINK}/conf/spark-env.sh
HADOOP_CONF_DIR=/etc/hadoop/conf/
SPARK_DIST_CLASSPATH=\\$(hadoop classpath)
LD_LIBRARY_PATH=\\${LD_LIBRARY_PATH}:/opt/cloudera/parcels/CDH/lib/hadoop/lib/native/
SPCNF
echo "configuring ${SPARK_LINK}/conf/spark-defaults.conf"
cat << SPCNF > ${SPARK_LINK}/conf/spark-defaults.conf
spark.shuffle.service.enabled true
# Execution Behavior
spark.broadcast.blockSize 4096
# Dynamic Resource Allocation (YARN)
spark.dynamicAllocation.enabled true
spark.speculation true
spark.scheduler.mode FAIR
spark.kryoserializer.buffer.max 1000m
spark.driver.maxResultSize 0
spark.serializer org.apache.spark.serializer.KryoSerializer
spark.yarn.preserve.staging.files false
spark.master yarn
spark.rdd.compress true
# Local execution of selected Spark functions
spark.localExecution.enabled true
spark.sql.parquet.binaryAsString true
spark.sql.parquet.compression.codec snappy
# use lz4 compression for broadcast variables as Snappy is not supported on Mac OS X
spark.broadcast.compress true
spark.io.compression.codec lz4
spark.driver.extraLibraryPath /opt/cloudera/parcels/CDH/lib/hadoop/lib/native
spark.executor.extraLibraryPath /opt/cloudera/parcels/CDH/lib/hadoop/lib/native
spark.executor.extraJavaOptions -verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+UseCompressedOops
spark.driver.extraJavaOptions -XX:+UseCompressedOops -XX:MaxPermSize=1g
spark.executor.extraClassPath /usr/local/lib/jdbc/sqlserver/*.jar:/usr/local/lib/jdbc/mysql/*.jar:/usr/local/anaconda/bin:/opt/udfs/hive/*.jar
spark.driver.extraClassPath /usr/local/lib/jdbc/sqlserver/*.jar:/usr/local/lib/jdbc/mysql/*.jar:/usr/local/anaconda/bin:/opt/udfs/hive/*.jar
SPCNF
echo "Add hive-site.xml configuration here!!!"
SCRIPT
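# A hedged sketch of the hive-site.xml snippet the echo above asks for; the
# property names are standard Hive metastore settings, and the connection
# details assume the local MySQL instance provisioned by $mysql_deps:
#
#   <property>
#     <name>javax.jdo.option.ConnectionURL</name>
#     <value>jdbc:mysql://localhost:3306/metastore</value>
#   </property>
#   <property>
#     <name>javax.jdo.option.ConnectionDriverName</name>
#     <value>com.mysql.jdbc.Driver</value>
#   </property>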
# Cloudera CDH dependencies
$cloudera_deps = <<SCRIPT
CLOUDERA_REPO=http://archive.cloudera.com/cdh5/redhat/6/x86_64/cdh/cloudera-cdh5.repo
# Add Cloudera repository
[ ! -e /etc/yum.repos.d/cloudera-cdh5.repo ] \
  && wget ${CLOUDERA_REPO} -q -P /etc/yum.repos.d/
# Cloudera Hadoop installation
yum install -y java-1.7.0-openjdk java-1.7.0-openjdk-devel hadoop \
  hadoop-conf-pseudo hadoop-hdfs-datanode hadoop-hdfs-journalnode \
  hadoop-hdfs-namenode hadoop-hdfs-secondarynamenode hadoop-hdfs-zkfc \
  hadoop-libhdfs-devel hadoop-mapreduce-historyserver hadoop-yarn-nodemanager \
  hadoop-yarn-resourcemanager zookeeper zookeeper-native zookeeper-server \
  oozie oozie-client kite sqoop hive hive-metastore hive-server2 hive-hcatalog \
  hive-jdbc avro-libs pig impala*
# format the namenode on first provisioning only
if [ ! -e /var/lib/hadoop-hdfs/cache/hdfs ]; then
  echo "Formatting HDFS..." \
    && sudo -u hdfs hdfs namenode -format -force &> /dev/null
fi
# auto-start services
chkconfig hadoop-hdfs-namenode on \
  && chkconfig hadoop-hdfs-datanode on \
  && chkconfig hadoop-yarn-resourcemanager on \
  && chkconfig hadoop-yarn-nodemanager on \
  && chkconfig hive-metastore on \
  && chkconfig hive-server2 on \
  && chkconfig oozie on
# start hadoop processes if they are not already running
if [ ! "$(ps aux | grep namenode | wc -l)" == "2" ]; then
  service hadoop-hdfs-namenode start
fi
if [ ! "$(ps aux | grep datanode | wc -l)" == "2" ]; then
  service hadoop-hdfs-datanode start
fi
if [ ! "$(ps aux | grep resourcemanager | wc -l)" == "2" ]; then
  service hadoop-yarn-resourcemanager start
fi
if [ ! "$(ps aux | grep nodemanager | wc -l)" == "2" ]; then
  service hadoop-yarn-nodemanager start
fi
echo "Creating HDFS directory structure" \
  && sudo -u hdfs hdfs dfs -mkdir -p /user \
  && sudo -u hdfs hdfs dfs -chmod -R 777 /user \
  && sudo -u hdfs hdfs dfs -mkdir -p /tmp \
  && sudo -u hdfs hdfs dfs -chmod -R 777 /tmp
SCRIPT
# OS configuration
$system_config = <<SCRIPT
DEV_USER=hadoop_oozie
DEV_PASSWORD=hadoop
PROXY_CONFIG=/etc/profile.d/proxy.sh
service iptables stop && chkconfig iptables off
if grep -q ryanair /etc/resolv.conf; then
  echo "export http_proxy=http://internalproxy.corp.ryanair.com:3128" > ${PROXY_CONFIG} \
    && echo "export https_proxy=http://internalproxy.corp.ryanair.com:3128" >> ${PROXY_CONFIG}
else
  rm -f ${PROXY_CONFIG}
fi
# Add entries to /etc/hosts
ip=$(ifconfig eth1 | awk '/inet addr/ {print substr($2,6)}')
host=$(hostname)
echo "127.0.0.1 localhost" > /etc/hosts
echo "$ip $host" >> /etc/hosts
# Add a dev user - don't worry about the password
if ! grep -q ${DEV_USER} /etc/passwd; then
  echo "Creating user ${DEV_USER}" && useradd -p $(openssl passwd -1 ${DEV_PASSWORD}) ${DEV_USER} \
    && echo "${DEV_USER} ALL=(ALL) NOPASSWD: ALL" > /etc/sudoers.d/hadoop_oozie
fi
SCRIPT
$information = <<SCRIPT
ip=$(ifconfig eth1 | awk '/inet addr/ {print substr($2,6)}')
echo "Guest IP address: $ip"
echo "Namenode's UI available at: http://$ip:50070"
echo "Resource Manager's UI available at: http://$ip:8088"
echo "MySQL root password: hadoop"
echo "You may want to add the line below to /etc/hosts:"
echo "$ip cdh.instance.com"
SCRIPT
Vagrant.configure(2) do |config|
  config.vm.box = "boxcutter/centos66"
  config.vm.hostname = "cdh.instance.com"
  config.vm.network :public_network, :bridge => "en3: Thunderbolt Ethernet", :mac => "0800DEADBEEF"
  config.vm.provider "virtualbox" do |vb|
    vb.name = "cloudera-hadoop"
    vb.cpus = 4
    vb.memory = 8192
    vb.customize ["modifyvm", :id, "--nicpromisc2", "allow-all"]
    vb.customize ["modifyvm", :id, "--cpuexecutioncap", "100"]
  end
  config.vm.provision :shell, :name => "system_config", :inline => $system_config
  config.vm.provision :shell, :name => "anaconda_deps", :inline => $anaconda_deps
  config.vm.provision :shell, :name => "mysql_deps", :inline => $mysql_deps
  config.vm.provision :shell, :name => "spark_deps", :inline => $spark_deps
  config.vm.provision :shell, :name => "cloudera_deps", :inline => $cloudera_deps
  config.vm.provision :shell, :name => "information", :inline => $information
end
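A minimal usage sketch, assuming Vagrant and VirtualBox are installed on the host; the bridged interface name ("en3: Thunderbolt Ethernet") may need to be adjusted to match your machine:

vagrant up                                # create the VM and run all provisioners
vagrant ssh -c "hdfs dfsadmin -report"    # check that HDFS came up
vagrant provision                         # re-run the provisioning scripts if needed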