Skip to content

Instantly share code, notes, and snippets.

View snehalnair's full-sized avatar

Snehal snehalnair

View GitHub Profile
# --- SageMaker setup fragment (gist scrape; flat indentation) ---
import sagemaker
# Assign a foldername
# S3 key prefix under which training data / model artifacts will be stored.
key_prefix = 'aws_model_xgboost'
#Initiate sagemaker session
# Uses default boto3 credentials/region from the environment.
session = sagemaker.Session()
#get container with the training code
# NOTE(review): get_image_uri is deprecated in SageMaker SDK v2
# (replaced by sagemaker.image_uris.retrieve) — confirm SDK version in use.
from sagemaker.amazon.amazon_estimator import get_image_uri
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
# Create ParamGrid and Evaluator for Cross Validation
# NOTE(review): `nb` (presumably a NaiveBayes estimator), `train`, and
# BinaryClassificationEvaluator are defined elsewhere — not visible here.
# Grid over the NaiveBayes additive-smoothing hyperparameter.
paramGrid = ParamGridBuilder().addGrid(nb.smoothing, [0.0, 0.2, 0.4, 0.6, 0.8, 1.0, 1.5, 2.0]).build()
# NOTE(review): rawPredictionCol is pointed at the hard "prediction" column,
# not the raw score column — the resulting AUC is computed from 0/1 outputs;
# verify this is intended.
cvEvaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction")
# Run Cross-validation
# numFolds is left at its default (3) since none is specified.
cv = CrossValidator(estimator=nb, estimatorParamMaps=paramGrid, evaluator=cvEvaluator)
cvModel = cv.fit(train)
# Assemble Spark ML Pipeline stages for text (SMS) classification.
# NOTE(review): RegexTokenizer / CountVectorizer imports are not visible here.
stages = []
# 1. clean data and tokenize sentences using RegexTokenizer
# Splits the raw "sms" text on runs of non-word characters.
regexTokenizer = RegexTokenizer(inputCol="sms", outputCol="tokens", pattern="\\W+")
stages += [regexTokenizer]
# 2. CountVectorize the data
# minDF=2.0: drop tokens that appear in fewer than 2 documents.
cv = CountVectorizer(inputCol="tokens", outputCol="token_features", minDF=2.0)#, vocabSize=3, minDF=2.0
stages += [cv]
# 3. Convert the labels to numerical values using binariser
# NOTE(review): the binariser stage itself is not visible in this snippet.
def get_binary_data(ratings):
    """Build a dense user x movie grid with a 0/1 'binary' interaction flag.

    Every (userId, movieId) pair present in `ratings` gets binary = 1;
    all other user/movie combinations get binary = 0.

    Parameters
    ----------
    ratings : pyspark.sql.DataFrame
        Must contain 'userId' and 'movieId' columns.

    Returns
    -------
    pyspark.sql.DataFrame with columns ['userId', 'movieId', 'binary'].
    """
    # Mark every observed rating row as a positive interaction.
    # NOTE(review): `fn` is assumed to be pyspark.sql.functions — imported
    # elsewhere in the original file.
    ratings = ratings.withColumn('binary', fn.lit(1))
    userIds = ratings.select("userId").distinct()
    movieIds = ratings.select("movieId").distinct()
    # Cross join enumerates every possible pair; the left join keeps the
    # binary=1 flag only where the pair was actually observed.
    user_movie = userIds.crossJoin(movieIds).join(ratings, ['userId', 'movieId'], "left")
    # Unobserved pairs come back NULL -> fill with 0.
    user_movie = user_movie.select(['userId', 'movieId', 'binary']).fillna(0)
    return user_movie


# NOTE(review): `ratings` must be defined at module level elsewhere.
user_movie = get_binary_data(ratings)
def get_mat_sparsity(ratings):
    """Print the sparsity of the user-item ratings matrix as a percentage.

    Sparsity = fraction of (user, movie) cells with no rating, i.e.
    1 - observed_ratings / (distinct_users * distinct_movies), times 100.

    Parameters
    ----------
    ratings : pyspark.sql.DataFrame
        Must contain 'rating', 'userId' and 'movieId' columns.

    Returns
    -------
    None — the result is printed, not returned.
    """
    # Count the total number of ratings in the dataset (numerator).
    count_nonzero = ratings.select("rating").count()
    # Total possible cells = distinct userIds x distinct movieIds (denominator).
    total_elements = ratings.select("userId").distinct().count() * ratings.select("movieId").distinct().count()
    # Fraction of empty cells, expressed as a percentage.
    sparsity = (1.0 - (count_nonzero * 1.0) / total_elements) * 100
    print("The ratings dataframe is ", "%.2f" % sparsity + "% sparse.")
# NOTE(review): this snippet is truncated — the trailing LSTM/output layers,
# compile step and return statement are missing, and indentation was lost in
# the scrape. `input_dim`, `output_dim`, `input_length` and the Keras classes
# (Sequential, Embedding, Bidirectional, LSTM) are defined/imported elsewhere.
def get_bilstm_lstm_model():
model = Sequential()
# Add Embedding layer
model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length))
# Add bidirectional LSTM
# return_sequences=True keeps per-timestep outputs; forward and backward
# hidden states are concatenated (merge_mode='concat').
model.add(Bidirectional(LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode = 'concat'))
# Add LSTM
# NOTE(review): truncated snippet — padding of the tag sequences (y) and the
# train/test/val split implied by the name are missing; indentation was lost
# in the scrape. `pad_sequences` (Keras) is imported elsewhere.
def get_pad_train_test_val(data_group, data):
#get max token and tag length
# Vocabulary sizes over the full dataset; n_token is reused as the pad value.
n_token = len(list(set(data['Word'].to_list())))
n_tag = len(list(set(data['Tag'].to_list())))
#Pad tokens (X var)
# data_group['Word_idx'] holds one list of token indices per sentence.
tokens = data_group['Word_idx'].tolist()
maxlen = max([len(s) for s in tokens])
# Post-pad each sentence to the longest length using index n_token - 1.
pad_tokens = pad_sequences(tokens, maxlen=maxlen, dtype='int32', padding='post', value= n_token - 1)
from itertools import chain
# NOTE(review): truncated snippet — the loop that fills tok2idx/idx2tok and
# the return statement are missing; indentation was lost in the scrape.
# Builds token<->index (or tag<->index) lookup dictionaries from a DataFrame
# with 'Word' and 'Tag' columns.
def get_dict_map(data, token_or_tag):
tok2idx = {}
idx2tok = {}
if token_or_tag == 'token':
vocab = list(set(data['Word'].to_list()))
else:
vocab = list(set(data['Tag'].to_list()))
# NOTE(review): truncated snippet — the inner loop body and the return of
# test_predictions_labels are missing; indentation was lost in the scrape.
# Maps per-position prediction scores back to their column label names.
def get_pred_labels(data, predictions):
# Columns from index 2 onward are taken to be the label columns.
y_cols = list(data.columns[2:])
y_label_dict={}
# Map column position -> label name.
for k,v in enumerate(y_cols):
y_label_dict[k] = v
test_predictions_labels = []
for pred in predictions:
label_pred = []
for index, label in enumerate(list(pred)):
def print_evaluation_scores(y_test, predicted):
    """Print accuracy, F1 and precision metrics for classifier predictions.

    Parameters
    ----------
    y_test : array-like of true labels.
    predicted : array-like of predicted labels (hard predictions).

    Returns
    -------
    None — all metrics are printed.

    NOTE(review): the sklearn metric functions (accuracy_score, f1_score,
    average_precision_score) are imported elsewhere in the original file.
    """
    # normalize=False makes accuracy_score print the COUNT of correctly
    # classified samples, not a fraction — confirm a count is intended
    # under the label 'Accuracy'.
    print('Accuracy: ', accuracy_score(y_test, predicted, normalize=False))
    print('F1-score macro: ', f1_score(y_test, predicted, average='macro'))
    print('F1-score micro: ', f1_score(y_test, predicted, average='micro'))
    print('F1-score weighted: ', f1_score(y_test, predicted, average='weighted'))
    # NOTE(review): average_precision_score is defined over scores /
    # probabilities; feeding hard 0/1 predictions yields a degenerate
    # precision-recall curve — verify this is the intended metric.
    print('Precision macro: ', average_precision_score(y_test, predicted, average='macro'))
    print('Precision micro: ', average_precision_score(y_test, predicted, average='micro'))
    print('Precision weighted: ', average_precision_score(y_test, predicted, average='weighted'))