argha0x · December 4, 2021 04:28
diff --git a/gistfile1.txt b/gistfile1.txt
 # Import pandas as pd first, then install the package below in the Kaggle notebook.
 # Work with these three data files:
 #   - take en.tsv
 #   - filter the 'en' sentences from train.tsv
 #   - filter the 'en' sentences from valid.tsv

 #pip install langdetect
 from langdetect import detect

 #filter 'en' from valid.tsv

 # Keep only the rows of `df` whose 'Sentence' text is detected as English.
 # `df` must already be loaded and contain a 'Sentence' column; it is rebound
 # to the filtered frame, re-indexed from 0, with the same column order.
 # NOTE(review): langdetect is reportedly non-deterministic unless
 # DetectorFactory.seed is set -- confirm whether reproducible filtering matters.
 from langdetect import LangDetectException

 columns = list(df)
 data = []
 for row in df.to_dict('records'):
     sentence = row['Sentence']
     # A missing value is not a string and has no .split(); skip such rows.
     if not isinstance(sentence, str):
         continue
     # Ignore unusable rows: 1000 or more whitespace-separated tokens.
     if len(sentence.split()) >= 1000:
         continue
     try:
         language = detect(sentence)
     except LangDetectException:
         # Raised when the text has no detectable features (e.g. empty or
         # punctuation/digits only); such a row cannot be English.
         continue
     if language == 'en':
         # Rows are keyed by their real column names, so nothing depends on
         # the column order of the input file.
         data.append(row)

 # Contains only the 'en' rows. DataFrame.append no longer exists in
 # pandas 2.0+, so the frame is built directly; passing `columns` keeps the
 # header even when no row survives the filter.
 df = pd.DataFrame(data, columns=columns)

 #similar loop for valid.tsv
	# Import pandas as pd first, then install the package below in the Kaggle notebook.
	# Work with these three data files:
	#   - take en.tsv
	#   - filter the 'en' sentences from train.tsv
	#   - filter the 'en' sentences from valid.tsv

	#pip install langdetect
	from langdetect import detect

	#filter 'en' from valid.tsv

	# Keep only the rows of `df` whose 'Sentence' text is detected as English.
	# `df` must already be loaded and contain a 'Sentence' column; it is rebound
	# to the filtered frame, re-indexed from 0, with the same column order.
	# NOTE(review): langdetect is reportedly non-deterministic unless
	# DetectorFactory.seed is set -- confirm whether reproducible filtering matters.
	from langdetect import LangDetectException

	columns = list(df)
	data = []
	for row in df.to_dict('records'):
	    sentence = row['Sentence']
	    # A missing value is not a string and has no .split(); skip such rows.
	    if not isinstance(sentence, str):
	        continue
	    # Ignore unusable rows: 1000 or more whitespace-separated tokens.
	    if len(sentence.split()) >= 1000:
	        continue
	    try:
	        language = detect(sentence)
	    except LangDetectException:
	        # Raised when the text has no detectable features (e.g. empty or
	        # punctuation/digits only); such a row cannot be English.
	        continue
	    if language == 'en':
	        # Rows are keyed by their real column names, so nothing depends on
	        # the column order of the input file.
	        data.append(row)

	# Contains only the 'en' rows. DataFrame.append no longer exists in
	# pandas 2.0+, so the frame is built directly; passing `columns` keeps the
	# header even when no row survives the filter.
	df = pd.DataFrame(data, columns=columns)

	#similar loop for valid.tsv