Instance: p2.xlarge
AMI ID: ubuntu/images/hvm-ssd/ubuntu-xenial-16.04-amd64-server-20170221
EBS volume for root: 30GB
| # train_grpo.py | |
| import re | |
| import torch | |
| from datasets import load_dataset, Dataset | |
| from transformers import AutoTokenizer, AutoModelForCausalLM | |
| from peft import LoraConfig | |
| from trl import GRPOConfig, GRPOTrainer | |
| # Load and prep dataset |
| import java.io.{ByteArrayOutputStream, File} | |
| import java.nio.charset.StandardCharsets | |
| import java.sql.{Date, Timestamp} | |
| import java.util.UUID | |
| import java.util.concurrent.atomic.AtomicLong | |
| import scala.util.Random | |
| import org.scalatest.Matchers._ |
| 1. Install minikube | |
| 2. Start minikube with enough CPUs and memory | |
| minikube start --memory='8196mb' --cpus=4 | |
| 3. The Spark pod doesn't specify a service account, so it uses "default". Spark will create pods, so we should grant sufficient | |
| permissions to the "default" service account. Create a role with kubectl and bind that role to the "default" service account | |
| kubectl create role default --verb=get,list,watch,create,update,patch,delete --resource=pods,pods/status | |
| kubectl create rolebinding default-binding --role=default --serviceaccount=default:default --namespace=default | |
| 4. Build the Spark images. Remember to build the PySpark image too. | |
| ./bin/docker-image-tool.sh -m -t dev -p resource-managers/kubernetes/docker/src/main/dockerfiles/spark/bindings/python/Dockerfile build |
| ================================================================================================ | |
| filter & aggregate without group | |
| ================================================================================================ | |
| OpenJDK 64-Bit Server VM 1.8.0_212-8u212-b03-0ubuntu1.18.04.1-b03 on Linux 4.15.0-1021-aws | |
| Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz | |
| range/filter/sum: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative | |
| ------------------------------------------------------------------------------------------------------------------------ | |
| range/filter/sum wholestage off 46264 47546 1814 45.3 22.1 1.0X | |
| range/filter/sum wholestage on 3156 3523 206 664.5 1.5 14.7X |
| # Update EC2 packages | |
| sudo yum install cmake boost-devel.x86_64 boost-python.x86_64 boost-serialization.x86_64 -y | |
| sudo yum install swig blas-devel.x86_64 lapack-devel.x86_64 -y | |
| # Install Python packages | |
| sudo pip install numpy bitarray |
| # See: http://www.lecloud.net/post/61401763496/install-update-to-python-2-7-and-latest-pip-on-ec2 | |
| # install build tools | |
| sudo yum install make automake gcc gcc-c++ kernel-devel git-core -y | |
| # install python 2.7 and change default python symlink | |
| # python27-devel or python27-python-devel.x86_64 | |
| sudo yum install python27-devel -y | |
| sudo rm /usr/bin/python | |
| sudo ln -s /usr/bin/python2.7 /usr/bin/python |
| use strict; | |
| use JSON; | |
| use Data::Dumper; | |
| open(TWEET_STAT, "<$ARGV[0]"); | |
| my $rows = []; | |
| while (<TWEET_STAT>) { |
| /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ | |
| /* */ | |
| /* Simple Node.js module to get distance between two coordinates. */ | |
| /* */ | |
| /* Code transformed from Chris Veness example code - please refer to his website for licensing */ | |
| /* questions. */ | |
| /* */ | |
| /* */ | |
| /* Latitude/longitude spherical geodesy formulae & scripts (c) Chris Veness 2002-2011 */ | |
| /* - www.movable-type.co.uk/scripts/latlong.html */ |
| # Basic text search with relevancy for MongoDB. | |
| # See http://blog.tty.nl/2010/02/08/simple-ranked-text-search-for-mongodb/ | |
| # Copythingie 2010 - Ward Bekker - [email protected] | |
| #create (or empty) a docs collection | |
| doc_col = MongoMapper.connection.db('example_db').collection('docs') | |
| doc_col.remove({}) | |
| #add some sample data | |
| doc_col.insert({ "txt" => "it is what it is"}) |