Cloudera Vagrantfile
# Anaconda dependencies
$anaconda_deps = <<SCRIPT
ANACONDA_INSTALLER=https://3230d63b5fc54e62148e-c95ac804525aac4b6dba79b00b39d1d3.ssl.cf1.rackcdn.com/Anaconda-2.3.0-Linux-x86_64.sh
if [ ! -d "/usr/local/anaconda" ]; then
  echo "Anaconda installation..." \
    && echo "downloading binaries" \
    && wget ${ANACONDA_INSTALLER} -q -P /tmp/ \
    && echo "running installer" \
    && bash /tmp/Anaconda-2.3.0-Linux-x86_64.sh -b -f -p /usr/local/anaconda
fi
SCRIPT
# MySQL dependencies
$mysql_deps = <<SCRIPT
MYSQL_REPO=https://dev.mysql.com/get/mysql-community-release-el6-5.noarch.rpm
MY_CNF=/etc/my.cnf
DEV_PASSWORD=hadoop
[ ! -e /etc/yum.repos.d/mysql-community.repo ] && rpm -ivh ${MYSQL_REPO}
yum install -y mysql-community-server
if [ -e /etc/init.d/mysqld ] && [ -z "$(grep -R vagrant ${MY_CNF})" ]; then
  echo "# InnoDB settings" >> ${MY_CNF}
  echo "default_storage_engine = innodb" >> ${MY_CNF}
  echo "innodb_file_per_table = 1" >> ${MY_CNF}
  echo "innodb_flush_log_at_trx_commit = 2" >> ${MY_CNF}
  echo "innodb_log_buffer_size = 64M" >> ${MY_CNF}
  echo "innodb_buffer_pool_size = 1G" >> ${MY_CNF}
  echo "innodb_thread_concurrency = 8" >> ${MY_CNF}
  echo "innodb_flush_method = O_DIRECT" >> ${MY_CNF}
  echo "innodb_log_file_size = 512M" >> ${MY_CNF}
  echo "explicit_defaults_for_timestamp = 1" >> ${MY_CNF}
  chkconfig mysqld on \
    && service mysqld start \
    && /usr/bin/mysqladmin -u root password "${DEV_PASSWORD}" &> /dev/null \
    && echo "# vagrant provisioned" >> ${MY_CNF}
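  # create the Hive metastore schema and the hive/hive user that the hive-site.xml files below point at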
  mysql -u root -p${DEV_PASSWORD} \
    -e "create schema if not exists hive; grant all on hive.* to 'hive'@'localhost' identified by 'hive'"
fi
SCRIPT
# Spark dependencies
$spark_deps = <<SCRIPT
SPARK_TGZ=spark-1.5.1-bin-without-hadoop.tgz
SPARK_LINK=/opt/spark
[ ! -e ${SPARK_LINK} ] \
  && echo "Spark installation..." \
  && echo "downloading binaries" \
  && wget http://ftp.heanet.ie/mirrors/www.apache.org/dist/spark/spark-1.5.1/${SPARK_TGZ} -q -P /opt/ \
  && tar zxf /opt/${SPARK_TGZ} -C /opt/ \
  && ln -s /opt/spark-1.5.1-bin-without-hadoop ${SPARK_LINK}
[ ! -e /opt/${SPARK_TGZ} ] && exit 1
echo "Spark configuration..."
echo "configuring /etc/profile.d/spark.sh"
echo 'export PATH=$PATH'":${SPARK_LINK}/bin" > /etc/profile.d/spark.sh
echo "configuring /opt/spark/conf/spark-env.sh"
cat << SPCNF > /opt/spark/conf/spark-env.sh
HADOOP_CONF_DIR=/etc/hadoop/conf/
SPARK_DIST_CLASSPATH=\\$(hadoop classpath)
LD_LIBRARY_PATH=\\${LD_LIBRARY_PATH}:/opt/cloudera/parcels/CDH/lib/hadoop/lib/native/
SPCNF
echo "configuring ${SPARK_LINK}/conf/spark-defaults.conf"
cat << SPCNF > ${SPARK_LINK}/conf/spark-defaults.conf
spark.shuffle.service.enabled true
# Execution Behavior
spark.broadcast.blockSize 4096
# Dynamic Resource Allocation (YARN)
spark.dynamicAllocation.enabled true
spark.speculation true
spark.scheduler.mode FAIR
spark.kryoserializer.buffer.max 1000m
spark.driver.maxResultSize 0
spark.serializer org.apache.spark.serializer.KryoSerializer
spark.yarn.preserve.staging.files false
spark.master yarn
spark.rdd.compress true
# Local execution of selected Spark functions
spark.localExecution.enabled true
spark.sql.parquet.binaryAsString true
spark.sql.parquet.compression.codec snappy
# use lz4 compression for broadcast variables as Snappy is not supported on MacOSX
spark.broadcast.compress true
spark.io.compression.codec lz4
spark.driver.extraLibraryPath /opt/cloudera/parcels/CDH/lib/hadoop/lib/native
spark.executor.extraLibraryPath /opt/cloudera/parcels/CDH/lib/hadoop/lib/native
spark.executor.extraJavaOptions -verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+UseCompressedOops
spark.driver.extraJavaOptions -XX:+UseCompressedOops -XX:MaxPermSize=1g
spark.executor.extraClassPath /usr/local/lib/jdbc/sqlserver/*.jar:/usr/local/lib/jdbc/mysql/*.jar:/usr/local/anaconda/bin:/opt/udfs/hive/*.jar
spark.driver.extraClassPath /usr/local/lib/jdbc/sqlserver/*.jar:/usr/local/lib/jdbc/mysql/*.jar:/usr/local/anaconda/bin:/opt/udfs/hive/*.jar
SPCNF
echo "configuring ${SPARK_LINK}/conf/hive-site.xml"
cat << HIVECNF > ${SPARK_LINK}/conf/hive-site.xml
<configuration>
  <property>
    <name>javax.jdo.option.ConnectionURL</name>
    <value>jdbc:mysql://localhost:3306/hive?createDatabaseIfNotExist=true</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionUserName</name>
    <value>hive</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionPassword</name>
    <value>hive</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionDriverName</name>
    <value>com.mysql.jdbc.Driver</value>
  </property>
  <property>
    <name>hive.metastore.uris</name>
    <value>thrift://cdh.instance.com:9083</value>
  </property>
  <property>
    <name>hive.metastore.warehouse.dir</name>
    <value>hdfs:///user/hive/warehouse</value>
  </property>
</configuration>
HIVECNF
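# spark.dynamicAllocation requires the external shuffle service: copy the Spark
# YARN shuffle jar where the NodeManager can load it as the spark_shuffle
# aux-service configured in yarn-site.xml below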
| echo "installing resource scheduler" \ | |
| && mkdir -p /usr/lib/hadoop-yarn/lib/ \ | |
| && cp -f ${SPARK_LINK}/lib/spark-*-yarn-shuffle.jar /usr/lib/hadoop-yarn/lib/ | |
| SCRIPT | |
| # Cloudera CDH dependencies | |
| $cloudera_deps = <<SCRIPT | |
| CLOUDERA_REPO=http://archive.cloudera.com/cdh5/redhat/6/x86_64/cdh/cloudera-cdh5.repo | |
| # Add Cloudera repository | |
| [ ! -e /etc/yum.repos.d/cloudera-cdh5.repo ] \ | |
| && wget ${CLOUDERA_REPO} -q -P /etc/yum.repos.d/ | |
| # Cloudera Hadoop installation | |
| yum install -y java-1.7.0-openjdk java-1.7.0-openjdk-devel hadoop \ | |
| hadoop-conf-pseudo hadoop-hdfs-datanode hadoop-hdfs-journalnode \ | |
| hadoop-hdfs-namenode hadoop-hdfs-secondarynamenode hadoop-hdfs-zkfc \ | |
| hadoop-libhdfs-devel hadoop-mapreduce-historyserver hadoop-yarn-nodemanager \ | |
| hadoop-yarn-resourcemanager zookeeper zookeeper-native zookeeper-server \ | |
| oozie oozie-client kite sqoop hive hive-metastore hive-server2 hive-hcatalog \ | |
| hive-jdbc avro-libs pig kite impala* | |
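# overwrite the pseudo-distributed defaults shipped by hadoop-conf-pseudo in /etc/hadoop/conf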
cat << HDPCNF > /etc/hadoop/conf/mapred-site.xml
<configuration>
  <property>
    <name>mapred.job.tracker</name>
    <value>localhost:8021</value>
  </property>
  <property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
  </property>
  <property>
    <name>mapreduce.jobhistory.address</name>
    <value>localhost:10020</value>
  </property>
  <property>
    <name>mapreduce.jobhistory.webapp.address</name>
    <value>localhost:19888</value>
  </property>
  <property>
    <description>To set the value of tmp directory for map and reduce tasks.</description>
    <name>mapreduce.task.tmp.dir</name>
    <value>/var/lib/hadoop-mapreduce/cache/\\${user.name}/tasks</value>
  </property>
  <property>
    <name>mapreduce.map.memory.mb</name>
    <value>512</value>
  </property>
  <property>
    <name>mapreduce.reduce.memory.mb</name>
    <value>512</value>
  </property>
</configuration>
HDPCNF
cat << YRNCNF > /etc/hadoop/conf/yarn-site.xml
<configuration>
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle,spark_shuffle</value>
  </property>
  <property>
    <name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
    <value>org.apache.hadoop.mapred.ShuffleHandler</value>
  </property>
  <property>
    <name>yarn.nodemanager.aux-services.spark_shuffle.class</name>
    <value>org.apache.spark.network.yarn.YarnShuffleService</value>
  </property>
  <property>
    <name>yarn.log-aggregation-enable</name>
    <value>true</value>
  </property>
  <property>
    <name>yarn.dispatcher.exit-on-error</name>
    <value>true</value>
  </property>
  <property>
    <description>List of directories to store localized files in.</description>
    <name>yarn.nodemanager.local-dirs</name>
    <value>/var/lib/hadoop-yarn/cache/\\${user.name}/nm-local-dir</value>
  </property>
  <property>
    <description>Where to store container logs.</description>
    <name>yarn.nodemanager.log-dirs</name>
    <value>/var/log/hadoop-yarn/containers</value>
  </property>
  <property>
    <description>Where to aggregate logs to.</description>
    <name>yarn.nodemanager.remote-app-log-dir</name>
    <value>/var/log/hadoop-yarn/apps</value>
  </property>
  <property>
    <description>Classpath for typical applications.</description>
    <name>yarn.application.classpath</name>
    <value>\\$HADOOP_CONF_DIR,\\$HADOOP_COMMON_HOME/*,\\$HADOOP_COMMON_HOME/lib/*,\\$HADOOP_HDFS_HOME/*,
      \\$HADOOP_HDFS_HOME/lib/*,\\$HADOOP_MAPRED_HOME/*,\\$HADOOP_MAPRED_HOME/lib/*,\\$HADOOP_YARN_HOME/*,
      \\$HADOOP_YARN_HOME/lib/*
    </value>
  </property>
</configuration>
YRNCNF
# format namenode
if [ ! -e /var/lib/hadoop-hdfs/cache/hdfs ]; then
  echo "Formatting HDFS..." \
    && sudo -u hdfs hdfs namenode -format -force &> /dev/null
fi
MYSQL_JDBC=mysql-connector-java-5.1.37
MYSQL_JDBC_SOURCE=http://dev.mysql.com/get/Downloads/Connector-J/${MYSQL_JDBC}.tar.gz
mkdir -p /usr/local/lib/jdbc/mysql \
  && echo "Downloading MySQL JDBC drivers" \
  && wget ${MYSQL_JDBC_SOURCE} -q -P /tmp/ \
  && echo "Installing MySQL JDBC drivers" \
  && tar zxf /tmp/${MYSQL_JDBC}.tar.gz -C /tmp/ \
  && cp /tmp/${MYSQL_JDBC}/mysql-connector-java*.jar /usr/lib/hive/lib/ \
  && cp /tmp/${MYSQL_JDBC}/mysql-connector-java*.jar /usr/local/lib/jdbc/mysql/
cat << HIVECNF > /etc/hive/conf/hive-site.xml
<configuration>
  <property>
    <name>javax.jdo.option.ConnectionURL</name>
    <value>jdbc:mysql://localhost:3306/hive?createDatabaseIfNotExist=true</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionUserName</name>
    <value>hive</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionPassword</name>
    <value>hive</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionDriverName</name>
    <value>com.mysql.jdbc.Driver</value>
  </property>
  <property>
    <name>hive.metastore.uris</name>
    <value>thrift://cdh.instance.com:9083</value>
  </property>
  <property>
    <name>hive.metastore.warehouse.dir</name>
    <value>hdfs:///user/hive/warehouse</value>
  </property>
</configuration>
HIVECNF
# auto-start services
chkconfig hadoop-hdfs-namenode on \
  && chkconfig hadoop-hdfs-datanode on \
  && chkconfig hadoop-yarn-resourcemanager on \
  && chkconfig hadoop-yarn-nodemanager on \
  && chkconfig hive-metastore on \
  && chkconfig hive-server2 on \
  && chkconfig oozie on
# start hadoop processes
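# a count of 2 means one matching daemon plus the grep process itself,
# i.e. the service is already running and does not need to be started again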
| if [ ! "$(ps aux | grep hdfs-namenode | wc -l)" == "2" ]; then | |
| service hadoop-hdfs-namenode start | |
| fi | |
| if [ ! "$(ps aux | grep datanode | wc -l)" == "2" ]; then | |
| service hadoop-hdfs-datanode start | |
| fi | |
| if [ ! "$(ps aux | grep resourcemanager | wc -l)" == "2" ]; then | |
| service hadoop-yarn-resourcemanager start | |
| fi | |
| if [ ! "$(ps aux | grep nodemanager | wc -l)" == "2" ]; then | |
| service hadoop-yarn-nodemanager start | |
| fi | |
| echo "Creating HDFS directory structure" \ | |
  && sudo -u hdfs hdfs dfs -mkdir -p /user \
  && sudo -u hdfs hdfs dfs -chmod -R 777 /user \
  && sudo -u hdfs hdfs dfs -mkdir -p /user/spark \
  && sudo -u hdfs hdfs dfs -chmod -R 755 /user/spark \
  && sudo -u hdfs hdfs dfs -mkdir -p /tmp \
  && sudo -u hdfs hdfs dfs -chmod -R 777 /tmp \
  && sudo -u hdfs hdfs dfs -mkdir -p /user/hive/warehouse \
  && sudo -u hdfs hdfs dfs -chown -R hive:hive /user/hive/warehouse \
  && sudo -u hdfs hdfs dfs -chmod -R 755 /user/hive/warehouse
if [ ! "$(ps aux | grep HiveMetaStore | wc -l)" == "2" ]; then
  service hive-metastore start
fi
if [ ! "$(ps aux | grep HiveServer2 | wc -l)" == "2" ]; then
  service hive-server2 start
fi
SCRIPT
# OS configuration
$system_config = <<SCRIPT
DEV_USER=hadoop_oozie
DEV_PASSWORD=hadoop
PROXY_CONFIG=/etc/profile.d/proxy.sh
service iptables stop && chkconfig iptables off
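# site-specific proxy settings: only applied when the corporate (ryanair) domain
# shows up in /etc/resolv.conf, otherwise any previous proxy profile is removed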
if grep ryanair /etc/resolv.conf; then
  echo "export http_proxy=http://internalproxy.corp.ryanair.com:3128" > ${PROXY_CONFIG} \
    && echo "export https_proxy=http://internalproxy.corp.ryanair.com:3128" >> ${PROXY_CONFIG}
else
  rm -fR ${PROXY_CONFIG}
fi
# Add entries to /etc/hosts
ip=$(ifconfig eth1 | awk -v host=$(hostname) '/inet addr/ {print substr($2,6)}')
host=$(hostname)
echo "127.0.0.1 localhost" > /etc/hosts
echo "$ip $host" >> /etc/hosts
# Add a dev user - don't worry about the password
if ! grep ${DEV_USER} /etc/passwd; then
  echo "Creating user ${DEV_USER}" && useradd -p $(openssl passwd -1 ${DEV_PASSWORD}) ${DEV_USER} \
    && echo "${DEV_USER} ALL=(ALL) NOPASSWD: ALL" > /etc/sudoers.d/hadoop_oozie
fi
SCRIPT
$information = <<SCRIPT
ip=$(ifconfig eth1 | awk -v host=$(hostname) '/inet addr/ {print substr($2,6)}')
echo "Guest IP address: $ip"
echo "Namenode's UI available at: http://$ip:50070"
echo "Resource Manager's UI available at: http://$ip:8088"
echo "MySQL root password: hadoop"
echo "You may want to add the below line to /etc/hosts:"
echo "$ip cdh.instance.com"
SCRIPT
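# Typical workflow, a sketch that assumes Vagrant with VirtualBox and a host
# interface that can be bridged for the public_network below:
#
#   vagrant up          # create the VM and run the provisioners in order
#   vagrant ssh         # log in; e.g. `hadoop fs -ls /` once HDFS is up
#   vagrant provision   # re-run the provisioning scripts after changes
#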
Vagrant.configure(2) do |config|
  config.vm.box = "boxcutter/centos66"
  config.vm.hostname = "cdh.instance.com"
  config.vm.network :public_network, :mac => "0800DEADBEEF"
  config.vm.provider "virtualbox" do |vb|
    vb.name = "cloudera-hadoop"
    vb.cpus = 4
    vb.memory = 8192
    vb.customize ["modifyvm", :id, "--nicpromisc2", "allow-all"]
    vb.customize ["modifyvm", :id, "--cpuexecutioncap", "100"]
  end
  config.vm.provision :shell, :name => "system_config", :inline => $system_config
  config.vm.provision :shell, :name => "anaconda_deps", :inline => $anaconda_deps
  config.vm.provision :shell, :name => "mysql_deps", :inline => $mysql_deps
  config.vm.provision :shell, :name => "spark_deps", :inline => $spark_deps
  config.vm.provision :shell, :name => "cloudera_deps", :inline => $cloudera_deps
  config.vm.provision :shell, :name => "information", :inline => $information
end