import pandas as pd
import re
import string
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory


class DataCleaning:
    # Initialization: stemmer, slang (alay) dictionaries, and stopword list.
    # These run when the class body is executed, so defining the class
    # downloads the dictionaries.
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()

    kamus_alay1 = pd.read_csv('https://raw.githubusercontent.com/fendiirfan/Kamus-Alay/main/Kamu-Alay.csv')
    kamus_alay1 = kamus_alay1.set_index('kataAlay')

    kamus_alay2 = pd.read_csv('https://raw.githubusercontent.com/nasalsabila/kamus-alay/master/colloquial-indonesian-lexicon.csv')
    kamus_alay2 = kamus_alay2.filter(['slang', 'formal'], axis=1)
    kamus_alay2 = kamus_alay2.drop_duplicates(subset=['slang'], keep='first')
    kamus_alay2 = kamus_alay2.set_index('slang')

    stopword1 = list(pd.read_csv('https://raw.githubusercontent.com/datascienceid/stopwords-bahasa-indonesia/master/stopwords_id_satya.txt', header=None)[0])
    custom_word = []  # Fill with any additional words to remove

    @classmethod
    def CleanDataFrame(cls, df, col_name, jum_minimum=None, minimum_kata=0):
        '''
        CleanDataFrame(DataFrame, ColumnName, MinimumDataCount, MinimumWords) -> DataFrame

        Returns a DataFrame containing the data cleaned according to
        DataCleaning.__cleanSentence__().
        '''
        final_list_clean = []
        final_list_kotor = []
        if jum_minimum is None:
            jum_minimum = len(df)
        if len(df) < jum_minimum:
            raise ValueError("The requested amount of data exceeds the available data")
        i = 0
        current = 0
        while i < len(df):
            current_kalimat = df.loc[i, col_name]
            clean_kalimat = cls.__cleanSentence__(current_kalimat)
            if len(clean_kalimat.split(' ')) > minimum_kata:
                final_list_clean.append(clean_kalimat)
                final_list_kotor.append(current_kalimat)
                current += 1
                if current % 10 == 0:
                    print("Processed {} rows".format(current))
            if current == jum_minimum:
                break
            i += 1
        data = {
            'raw': final_list_kotor,
            'processed': final_list_clean
        }
        return pd.DataFrame(data)

    @classmethod
    def CleanSentence(cls, text):
        return cls.__cleanSentence__(text)

    @classmethod
    def __cleanSentence__(cls, text):
        '''
        Preprocesses a sentence by stripping its formatting, removing
        stopwords, replacing known slang (alay) words with their standard
        forms, and stemming the result.
        '''
        #
        # Cleaning Formatted Text using Regex
        #
        # remove URLs
        text = re.sub(r'http\S+', '', text)
        # remove mentions and hashtags
        text = re.sub(r'(@\w+|#\w+)', '', text)
        # remove HTML tags
        text = re.sub(r'<.*?>', '', text)
        # remove punctuation
        # I changed it to this to be safe; with the old approach a lot of
        # words ended up glued together -kaenova
        temp_text = list(text)
        for i in range(len(temp_text)):
            if temp_text[i] in string.punctuation:
                temp_text[i] = " "
        text = ''.join(temp_text)
        # it previously looked like this -kaenova
        # text = text.translate(str.maketrans(' ', ' ', string.punctuation))
        # keep only alphabetic characters
        text = re.sub(r'[^a-zA-Z]', ' ', text)
        # replace newlines with spaces
        text = re.sub(r'\n', ' ', text)
        # convert to lower case
        text = text.lower()
        # remove boilerplate tokens
        text = re.sub(r'(username|user|url|rt|xf|fx|xe|xa)\s|\s(user|url|rt|xf|fx|xe|xa)', '', text)
        # collapse characters repeated three or more times
        text = re.sub(r'(\w)(\1{2,})', r'\1', text)
        # remove single-letter words
        text = re.sub(r'\b[a-zA-Z]\b', '', text)
        # collapse runs of whitespace
        text = re.sub(r'\s{2,}', ' ', text)
        # join the words
        text = ' '.join(text.split())
        text_split = text.split(' ')

        #
        # Replace non-standard (slang) words with their formal forms
        # no try/except here anymore; this is actually simpler
        #
        for i in range(len(text_split)):
            if text_split[i] in cls.kamus_alay1.index:
                text_split[i] = cls.kamus_alay1.loc[text_split[i]]['kataBaik']
            elif text_split[i] in cls.kamus_alay2.index:
                text_split[i] = cls.kamus_alay2.loc[text_split[i]]['formal']

        #
        # Stemming
        #
        text = cls.stemmer.stem(' '.join(text_split))
        text_split = text.split(' ')

        #
        # Removing stopwords and custom words
        #
        temp_text_split = []
        for i in range(len(text_split)):
            if (text_split[i] not in cls.stopword1) and (text_split[i] not in cls.custom_word):
                temp_text_split.append(text_split[i])
        final_text = ' '.join(temp_text_split)
        return final_text
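

# Minimal usage sketch (not part of the original gist): assumes pandas and
# Sastrawi are installed (pip install Sastrawi) and that the dictionary and
# stopword URLs above are reachable, since they are downloaded when the class
# body is executed. The sample sentences below are made up for illustration.
if __name__ == "__main__":
    # Clean a single sentence; all methods are classmethods, so no
    # instantiation is needed.
    print(DataCleaning.CleanSentence("Gue seneng bgt sama produk ini!! Cek http://contoh.com @teman"))

    # Clean a whole DataFrame column; the result has 'raw' and 'processed' columns.
    sample_df = pd.DataFrame({'tweet': [
        "Keren bgt barangnya, recommended!!",
        "Kunjungi http://contoh.com ya #promo",
    ]})
    print(DataCleaning.CleanDataFrame(sample_df, 'tweet', minimum_kata=1))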