Skip to content

Instantly share code, notes, and snippets.

View snehalnair's full-sized avatar

Snehal snehalnair

View GitHub Profile
# --- SageMaker setup fragment (gist scrape; flat indentation) ---
import sagemaker
# Assign a foldername
# S3 key prefix under which training data / model artifacts will be stored.
key_prefix = 'aws_model_xgboost'
#Initiate sagemaker session
# Uses default boto3 credentials/region from the environment.
session = sagemaker.Session()
#get container with the training code
# NOTE(review): get_image_uri is deprecated in SageMaker SDK v2
# (replaced by sagemaker.image_uris.retrieve) — confirm SDK version in use.
from sagemaker.amazon.amazon_estimator import get_image_uri
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
# Create ParamGrid and Evaluator for Cross Validation
# NOTE(review): `nb` (presumably a NaiveBayes estimator), `train`, and
# BinaryClassificationEvaluator are defined elsewhere — not visible here.
# Grid over the NaiveBayes additive-smoothing hyperparameter.
paramGrid = ParamGridBuilder().addGrid(nb.smoothing, [0.0, 0.2, 0.4, 0.6, 0.8, 1.0, 1.5, 2.0]).build()
# NOTE(review): rawPredictionCol is pointed at the hard "prediction" column,
# not the raw score column — the resulting AUC is computed from 0/1 outputs;
# verify this is intended.
cvEvaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction")
# Run Cross-validation
# numFolds is left at its default (3) since none is specified.
cv = CrossValidator(estimator=nb, estimatorParamMaps=paramGrid, evaluator=cvEvaluator)
cvModel = cv.fit(train)
# Assemble Spark ML Pipeline stages for text (SMS) classification.
# NOTE(review): RegexTokenizer / CountVectorizer imports are not visible here.
stages = []
# 1. clean data and tokenize sentences using RegexTokenizer
# Splits the raw "sms" text on runs of non-word characters.
regexTokenizer = RegexTokenizer(inputCol="sms", outputCol="tokens", pattern="\\W+")
stages += [regexTokenizer]
# 2. CountVectorize the data
# minDF=2.0: drop tokens that appear in fewer than 2 documents.
cv = CountVectorizer(inputCol="tokens", outputCol="token_features", minDF=2.0)#, vocabSize=3, minDF=2.0
stages += [cv]
# 3. Convert the labels to numerical values using binariser
# NOTE(review): the binariser stage itself is not visible in this snippet.
def get_binary_data(ratings):
    """Build a dense user x movie grid with a 0/1 'binary' interaction flag.

    Every (userId, movieId) pair present in `ratings` gets binary = 1;
    all other user/movie combinations get binary = 0.

    Parameters
    ----------
    ratings : pyspark.sql.DataFrame
        Must contain 'userId' and 'movieId' columns.

    Returns
    -------
    pyspark.sql.DataFrame with columns ['userId', 'movieId', 'binary'].
    """
    # Mark every observed rating row as a positive interaction.
    # NOTE(review): `fn` is assumed to be pyspark.sql.functions — imported
    # elsewhere in the original file.
    ratings = ratings.withColumn('binary', fn.lit(1))
    userIds = ratings.select("userId").distinct()
    movieIds = ratings.select("movieId").distinct()
    # Cross join enumerates every possible pair; the left join keeps the
    # binary=1 flag only where the pair was actually observed.
    user_movie = userIds.crossJoin(movieIds).join(ratings, ['userId', 'movieId'], "left")
    # Unobserved pairs come back NULL -> fill with 0.
    user_movie = user_movie.select(['userId', 'movieId', 'binary']).fillna(0)
    return user_movie


# NOTE(review): `ratings` must be defined at module level elsewhere.
user_movie = get_binary_data(ratings)
def get_mat_sparsity(ratings):
    """Print the sparsity of the user-item ratings matrix as a percentage.

    Sparsity = fraction of (user, movie) cells with no rating, i.e.
    1 - observed_ratings / (distinct_users * distinct_movies), times 100.

    Parameters
    ----------
    ratings : pyspark.sql.DataFrame
        Must contain 'rating', 'userId' and 'movieId' columns.

    Returns
    -------
    None — the result is printed, not returned.
    """
    # Count the total number of ratings in the dataset (numerator).
    count_nonzero = ratings.select("rating").count()
    # Total possible cells = distinct userIds x distinct movieIds (denominator).
    total_elements = ratings.select("userId").distinct().count() * ratings.select("movieId").distinct().count()
    # Fraction of empty cells, expressed as a percentage.
    sparsity = (1.0 - (count_nonzero * 1.0) / total_elements) * 100
    print("The ratings dataframe is ", "%.2f" % sparsity + "% sparse.")
# NOTE(review): this snippet is truncated — the trailing LSTM/output layers,
# compile step and return statement are missing, and indentation was lost in
# the scrape. `input_dim`, `output_dim`, `input_length` and the Keras classes
# (Sequential, Embedding, Bidirectional, LSTM) are defined/imported elsewhere.
def get_bilstm_lstm_model():
model = Sequential()
# Add Embedding layer
model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length))
# Add bidirectional LSTM
# return_sequences=True keeps per-timestep outputs; forward and backward
# hidden states are concatenated (merge_mode='concat').
model.add(Bidirectional(LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode = 'concat'))
# Add LSTM
# NOTE(review): truncated snippet — padding of the tag sequences (y) and the
# train/test/val split implied by the name are missing; indentation was lost
# in the scrape. `pad_sequences` (Keras) is imported elsewhere.
def get_pad_train_test_val(data_group, data):
#get max token and tag length
# Vocabulary sizes over the full dataset; n_token is reused as the pad value.
n_token = len(list(set(data['Word'].to_list())))
n_tag = len(list(set(data['Tag'].to_list())))
#Pad tokens (X var)
# data_group['Word_idx'] holds one list of token indices per sentence.
tokens = data_group['Word_idx'].tolist()
maxlen = max([len(s) for s in tokens])
# Post-pad each sentence to the longest length using index n_token - 1.
pad_tokens = pad_sequences(tokens, maxlen=maxlen, dtype='int32', padding='post', value= n_token - 1)
from itertools import chain
# NOTE(review): truncated snippet — the loop that fills tok2idx/idx2tok and
# the return statement are missing; indentation was lost in the scrape.
# Builds token<->index (or tag<->index) lookup dictionaries from a DataFrame
# with 'Word' and 'Tag' columns.
def get_dict_map(data, token_or_tag):
tok2idx = {}
idx2tok = {}
if token_or_tag == 'token':
vocab = list(set(data['Word'].to_list()))
else:
vocab = list(set(data['Tag'].to_list()))
# NOTE(review): truncated snippet — the inner loop body and the return of
# test_predictions_labels are missing; indentation was lost in the scrape.
# Maps per-position prediction scores back to their column label names.
def get_pred_labels(data, predictions):
# Columns from index 2 onward are taken to be the label columns.
y_cols = list(data.columns[2:])
y_label_dict={}
# Map column position -> label name.
for k,v in enumerate(y_cols):
y_label_dict[k] = v
test_predictions_labels = []
for pred in predictions:
label_pred = []
for index, label in enumerate(list(pred)):
def print_evaluation_scores(y_test, predicted):
    """Print accuracy, F1 and precision metrics for classifier predictions.

    Parameters
    ----------
    y_test : array-like of true labels.
    predicted : array-like of predicted labels (hard predictions).

    Returns
    -------
    None — all metrics are printed.

    NOTE(review): the sklearn metric functions (accuracy_score, f1_score,
    average_precision_score) are imported elsewhere in the original file.
    """
    # normalize=False makes accuracy_score print the COUNT of correctly
    # classified samples, not a fraction — confirm a count is intended
    # under the label 'Accuracy'.
    print('Accuracy: ', accuracy_score(y_test, predicted, normalize=False))
    print('F1-score macro: ', f1_score(y_test, predicted, average='macro'))
    print('F1-score micro: ', f1_score(y_test, predicted, average='micro'))
    print('F1-score weighted: ', f1_score(y_test, predicted, average='weighted'))
    # NOTE(review): average_precision_score is defined over scores /
    # probabilities; feeding hard 0/1 predictions yields a degenerate
    # precision-recall curve — verify this is the intended metric.
    print('Precision macro: ', average_precision_score(y_test, predicted, average='macro'))
    print('Precision micro: ', average_precision_score(y_test, predicted, average='micro'))
    print('Precision weighted: ', average_precision_score(y_test, predicted, average='weighted'))