Cloudera Vagrantfile
A Vagrantfile that provisions a single-node CDH 5 pseudo-distributed VM on CentOS 6.6, with Anaconda, MySQL Community Server, and Spark 1.5.1.
$anaconda_deps = <<SCRIPT
ANACONDA_INSTALLER=https://3230d63b5fc54e62148e-c95ac804525aac4b6dba79b00b39d1d3.ssl.cf1.rackcdn.com/Anaconda-2.3.0-Linux-x86_64.sh
if [ ! -d "/usr/local/anaconda" ]; then
  echo "Anaconda installation..." \
    && wget ${ANACONDA_INSTALLER} -q -P /tmp/ \
    && bash /tmp/Anaconda-2.3.0-Linux-x86_64.sh -b -f -p /usr/local/anaconda
fi
SCRIPT
$mysql_deps = <<SCRIPT
MYSQL_REPO=https://dev.mysql.com/get/mysql-community-release-el6-5.noarch.rpm
MY_CNF=/etc/my.cnf
DEV_PASSWORD=hadoop
# Register the MySQL community repository once, then install the server
[ ! -e /etc/yum.repos.d/mysql-community.repo ] && rpm -ivh ${MYSQL_REPO}
yum install -y mysql-community-server
# Tune InnoDB and set the root password only on first provisioning;
# the "# vagrant provisioned" marker keeps this block idempotent
if [ -e /etc/init.d/mysqld ] && ! grep -q vagrant ${MY_CNF}; then
  echo "# InnoDB settings" >> ${MY_CNF}
  echo "innodb_file_per_table = 1" >> ${MY_CNF}
  echo "innodb_flush_log_at_trx_commit = 2" >> ${MY_CNF}
  echo "innodb_log_buffer_size = 64M" >> ${MY_CNF}
  echo "innodb_buffer_pool_size = 1G" >> ${MY_CNF}
  echo "innodb_thread_concurrency = 8" >> ${MY_CNF}
  echo "innodb_flush_method = O_DIRECT" >> ${MY_CNF}
  echo "innodb_log_file_size = 512M" >> ${MY_CNF}
  echo "explicit_defaults_for_timestamp = 1" >> ${MY_CNF}
  chkconfig mysqld on \
    && service mysqld start \
    && /usr/bin/mysqladmin -u root password "${DEV_PASSWORD}" \
    && echo "# vagrant provisioned" >> ${MY_CNF}
fi
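# Hypothetical follow-up, not in the original gist: the Hive metastore
# installed by $cloudera_deps below still needs a backing database. A
# minimal sketch, assuming the conventional hive/hive credentials:
#   mysql -u root -p${DEV_PASSWORD} -e "CREATE DATABASE IF NOT EXISTS metastore; \
#     GRANT ALL ON metastore.* TO 'hive'@'localhost' IDENTIFIED BY 'hive';"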
SCRIPT
$spark_deps = <<SCRIPT
SPARK_TGZ=spark-1.5.1-bin-without-hadoop.tgz
SPARK_LINK=/opt/spark
[ ! -e ${SPARK_LINK} ] \
  && echo "Spark installation..." \
  && wget http://ftp.heanet.ie/mirrors/www.apache.org/dist/spark/spark-1.5.1/${SPARK_TGZ} -q -P /opt/ \
  && tar zxf /opt/${SPARK_TGZ} -C /opt/ \
  && ln -s /opt/spark-1.5.1-bin-without-hadoop ${SPARK_LINK}
# Abort if the download failed
[ ! -e /opt/${SPARK_TGZ} ] && exit 1
# Put Spark on the PATH for login shells (profile.d, not init.d)
echo "export PATH=\$PATH:${SPARK_LINK}/bin" > /etc/profile.d/spark.sh
# Point Spark at the Hadoop configuration and jars; "hadoop classpath"
# requires the Hadoop packages, so $cloudera_deps must run first
echo "export HADOOP_CONF_DIR=/etc/hadoop/conf/" > ${SPARK_LINK}/conf/spark-env.sh
echo "export SPARK_DIST_CLASSPATH=$(hadoop classpath)" >> ${SPARK_LINK}/conf/spark-env.sh
echo "export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/opt/cloudera/parcels/CDH/lib/hadoop/lib/native/" >> ${SPARK_LINK}/conf/spark-env.sh
cat << SPCNF > ${SPARK_LINK}/conf/spark-defaults.conf
spark.yarn.jar hdfs:///user/spark/share/lib/spark-assembly-1.5.1-hadoop2.6.0.jar
spark.shuffle.service.enabled true
# Execution Behavior
spark.broadcast.blockSize 4096
# Dynamic Resource Allocation (YARN)
spark.dynamicAllocation.enabled true
spark.speculation true
spark.scheduler.mode FAIR
spark.kryoserializer.buffer.max 1000m
spark.driver.maxResultSize 0
spark.serializer org.apache.spark.serializer.KryoSerializer
spark.yarn.preserve.staging.files false
spark.master yarn
spark.rdd.compress true
# Local execution of selected Spark functions
spark.localExecution.enabled true
spark.sql.parquet.binaryAsString true
spark.sql.parquet.compression.codec snappy
# Use lz4 compression for broadcast variables, as Snappy is not supported on Mac OS X
spark.broadcast.compress true
spark.io.compression.codec lz4
spark.driver.extraLibraryPath /opt/cloudera/parcels/CDH/lib/hadoop/lib/native
spark.executor.extraLibraryPath /opt/cloudera/parcels/CDH/lib/hadoop/lib/native
spark.executor.extraJavaOptions -verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+UseCompressedOops
spark.driver.extraJavaOptions -XX:+UseCompressedOops -XX:MaxPermSize=1g
spark.executor.extraClassPath /usr/local/lib/jdbc/sqlserver/*.jar:/usr/local/lib/jdbc/mysql/*.jar:/usr/local/anaconda/bin:/opt/udfs/hive/*.jar
spark.driver.extraClassPath /usr/local/lib/jdbc/sqlserver/*.jar:/usr/local/lib/jdbc/mysql/*.jar:/usr/local/anaconda/bin:/opt/udfs/hive/*.jar
SPCNF
| echo "Add hive-site.xml configuration here !!!" | |
SCRIPT
$cloudera_deps = <<SCRIPT
CLOUDERA_REPO=http://archive.cloudera.com/cdh5/redhat/6/x86_64/cdh/cloudera-cdh5.repo
# Add Cloudera repository
[ ! -e /etc/yum.repos.d/cloudera-cdh5.repo ] \
  && wget ${CLOUDERA_REPO} -q -P /etc/yum.repos.d/
# Cloudera Hadoop installation (pseudo-distributed configuration)
yum install -y java-1.7.0-openjdk java-1.7.0-openjdk-devel hadoop hadoop-conf-pseudo hadoop-hdfs-datanode hadoop-hdfs-journalnode hadoop-hdfs-namenode hadoop-hdfs-secondarynamenode hadoop-hdfs-zkfc hadoop-libhdfs-devel hadoop-mapreduce-historyserver hadoop-yarn-nodemanager hadoop-yarn-resourcemanager zookeeper zookeeper-native zookeeper-server oozie oozie-client kite sqoop hive hive-metastore hive-server2 hive-hcatalog hive-jdbc avro-libs pig impala*
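# First-run steps, a hedged sketch (assumption, not in the original gist):
# installing the packages alone does not leave HDFS/YARN running. The CDH 5
# pseudo-distributed guide uses roughly this sequence:
#   sudo -u hdfs hdfs namenode -format
#   for svc in /etc/init.d/hadoop-hdfs-*; do service $(basename $svc) start; done
#   /usr/lib/hadoop/libexec/init-hdfs.sh
#   service hadoop-yarn-resourcemanager start
#   service hadoop-yarn-nodemanager start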
SCRIPT
$system_config = <<SCRIPT
DEV_USER=hadoop_oozie
DEV_PASSWORD=hadoop
PROXY_CONFIG=/etc/profile.d/proxy.sh
# Disable the firewall for local development
service iptables stop && chkconfig iptables off
# Configure an HTTP proxy only when running inside the corporate network
if grep -q ryanair /etc/resolv.conf; then
  echo "export http_proxy=http://internalproxy.corp.ryanair.com:3128" > ${PROXY_CONFIG} \
    && echo "export https_proxy=http://internalproxy.corp.ryanair.com:3128" >> ${PROXY_CONFIG}
else
  rm -f ${PROXY_CONFIG}
fi
# Add a dev user - don't worry about the password
if ! grep -q ${DEV_USER} /etc/passwd; then
  echo "Creating user ${DEV_USER}" && useradd -p $(openssl passwd -1 ${DEV_PASSWORD}) ${DEV_USER} \
    && echo "${DEV_USER} ALL=(ALL) NOPASSWD: ALL" > /etc/sudoers.d/hadoop_oozie
fi
SCRIPT
Vagrant.configure(2) do |config|

  config.vm.box = "boxcutter/centos66"
  config.vm.hostname = "cdh.instance.com"

  config.vm.network :public_network, :bridge => "en3: Thunderbolt Ethernet", :mac => "0800DEADBEEF"

  config.vm.provider "virtualbox" do |vb|
    vb.name = "cloudera-hadoop"
    vb.cpus = 4
    vb.memory = 8192
    vb.customize ["modifyvm", :id, "--nicpromisc2", "allow-all"]
    vb.customize ["modifyvm", :id, "--cpuexecutioncap", "100"]
  end

  config.vm.provision :shell, :name => "system_config", :inline => $system_config
  config.vm.provision :shell, :name => "anaconda_deps", :inline => $anaconda_deps
  config.vm.provision :shell, :name => "mysql_deps", :inline => $mysql_deps
  # cloudera_deps runs before spark_deps because spark-env.sh is generated
  # from "hadoop classpath", which needs the Hadoop packages in place
  config.vm.provision :shell, :name => "cloudera_deps", :inline => $cloudera_deps
  config.vm.provision :shell, :name => "spark_deps", :inline => $spark_deps
end
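Usage, a minimal sketch (assumes Vagrant and VirtualBox are installed and the
bridged interface name matches your host):

    vagrant up    # create and provision the VM
    vagrant ssh   # log in; the hadoop_oozie dev user (password: hadoop) also works
    spark-submit --version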