Skip to content

Instantly share code, notes, and snippets.

@bartekdobija
Last active August 31, 2015 13:33
Show Gist options
  • Save bartekdobija/4ca798a5407f57ab26a3 to your computer and use it in GitHub Desktop.
#!/usr/bin/env bash
#
# Build a "hadoop-provided" Spark distribution tarball against a locally
# compiled Hadoop 2.6 with native libraries enabled. Run from the Spark
# source root.
#
# In this case I have a Hadoop distro compiled from source:
#   MAVEN_OPTS="-Xms512m -Xmx1024m" mvn package -Pdist,native -DskipTests -Dtar
# verified with:
#   hadoop checknative -a
# with output:
#   Native library checking:
#   hadoop:  true /usr/local/hadoop-2.6.0/lib/native/libhadoop.so.1.0.0
#   zlib:    true /lib64/libz.so.1
#   snappy:  true /usr/lib64/libsnappy.so.1
#   lz4:     true revision:99
#   bzip2:   true /lib64/libbz2.so.1
#   openssl: true /usr/lib64/libcrypto.so
#
# Spark without hadoop dependencies.
# Don't forget to install snappy & snappy-devel on RHEL/CentOS etc.
# Spark dependencies should be configured as per this document:
#   https://spark.apache.org/docs/latest/hadoop-provided.html
#
####### spark-defaults.conf: #######
#spark.yarn.jar hdfs:///user/spark/share/lib/spark-assembly-1.5.0-SNAPSHOT-hadoop2.6.0.jar
#spark.ui.enabled false
##spark.shuffle.spill false
##spark.shuffle.spill.compress true
##spark.shuffle.consolidateFiles true
##spark.shuffle.service.enabled true
## Execution Behavior
#spark.broadcast.blockSize 4096
## Dynamic Resource Allocation (YARN)
##spark.dynamicAllocation.enabled true
##spark.dynamicAllocation.executorIdleTimeout 10800
##spark.dynamicAllocation.initialExecutors 3
##spark.speculation true
#spark.scheduler.mode FAIR
#spark.executor.memory 5G
#spark.kryoserializer.buffer.max 1000m
#spark.driver.maxResultSize 0
#spark.serializer org.apache.spark.serializer.KryoSerializer
#spark.yarn.preserve.staging.files false
#spark.master yarn
#spark.rdd.compress true
## Local execution of selected Spark functions
#spark.localExecution.enabled true
#spark.sql.parquet.binaryAsString true
#spark.sql.parquet.compression.codec snappy
## use lz4 compression for broadcast variables as Snappy is not supported on MacOSX
#spark.broadcast.compress true
#spark.io.compression.codec lz4
#spark.driver.extraLibraryPath /usr/local/hadoop/lib/native/
#spark.executor.extraLibraryPath /opt/cloudera/parcels/CDH/lib/hadoop/lib/native
#spark.executor.extraClassPath /usr/local/lib/jdbc/sqlserver/*.jar:/usr/local/lib/jdbc/mysql/*.jar:/usr/local/anaconda/bin
####### spark-env.sh #######
# HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop/
# SPARK_DIST_CLASSPATH=$(hadoop classpath)
# LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/hadoop/lib/native/

# Fail fast: abort on command errors, unset variables, and failed pipeline stages.
set -euo pipefail

# Guard against running from the wrong directory: make-distribution.sh lives
# in the Spark source root, and a bare relative invocation would otherwise
# die with an unhelpful "No such file or directory".
if [[ ! -x ./make-distribution.sh ]]; then
  printf 'error: ./make-distribution.sh not found or not executable — run this from the Spark source root\n' >&2
  exit 1
fi

# -Phadoop-provided keeps Hadoop jars out of the assembly; SPARK_DIST_CLASSPATH
# (see spark-env.sh above) supplies them at runtime instead.
./make-distribution.sh \
  --name without-hadoop \
  --tgz \
  -Phadoop-2.6 \
  -Psparkr \
  -Phadoop-provided \
  -Phive \
  -Phive-thriftserver \
  -Pyarn \
  -DzincPort=3038 \
  -DskipTests \
  -Dmaven.javadoc.skip=true
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment