"""Convert messages exported from Telegram (HTML pages) into a CSV file.

The fields extracted from every message are described by a YAML config file
(see ``TGMsgLoader``).

Example (programmatic use)::

    tg_loader = TGMsgLoader('AIS3-chats/official', 'text_sels.yaml')
    df = tg_loader.save_to_csv('official.csv')
"""
import argparse
import logging
import re
from glob import glob
from os.path import basename, join

import pandas as pd
from bs4 import BeautifulSoup
from yaml import load, Loader

logger = logging.getLogger(__name__)


class TGMsgLoader:
    """Load messages from the ``messages*.html`` pages of an export folder.

    The YAML config must provide two keys:

    * ``messages``: a CSS selector matching every message element of a page.
    * ``each_msg``: a mapping ``field name -> {selector, value}``.
      ``selector`` is a CSS selector resolved relative to the message
      element; ``value`` is a Python expression evaluated with ``v`` bound
      to the selected element, and its result becomes the field value.
    """

    # Page files are named "messages.html", "messages2.html", ...
    _PAGE_PATTERN = re.compile(r'messages(\d*)\.html')

    def __init__(self, msg_dir, config_path):
        """Collect the export pages of *msg_dir* and read the config.

        Args:
            msg_dir: Directory containing the exported ``messages*.html``.
            config_path: Path of the YAML selector configuration.
        """
        # Sort numerically so that "messages10.html" follows "messages9.html".
        self.file_names = sorted(
            glob(join(msg_dir, 'messages*.html')), key=self._page_index)
        with open(config_path, encoding='utf-8') as fd:
            # NOTE(security): the full ``Loader`` can construct arbitrary
            # Python objects and the config is later passed to ``eval``;
            # only use config files you trust.
            self.sel_config: dict = load(fd.read(), Loader=Loader)

    @classmethod
    def _page_index(cls, file_name):
        """Return the numeric sort key of one export page path.

        The un-numbered first page ("messages.html") gets index 0. Files that
        match the glob but not the page naming scheme are sorted last instead
        of raising.
        """
        # Match on the base name only so that digits or the word "messages"
        # in a directory name cannot be picked up by mistake.
        match = cls._PAGE_PATTERN.fullmatch(basename(file_name))
        if match is None:
            return float('inf')
        return int(match.group(1) or 0)

    def extract(self):
        """Yield one ``dict`` per message, in page order.

        Each dict maps every field name of ``each_msg`` to its evaluated
        value, or to ``None`` when the field's selector matches nothing in
        that message.
        """
        for path in self.file_names:
            logger.info('Parsing %s', path)
            # NOTE(review): assumes the export pages are UTF-8 encoded;
            # confirm if exports from other tools are fed in.
            with open(path, encoding='utf-8') as fd:
                html_content = fd.read()
            soup = BeautifulSoup(html_content, 'lxml')
            msgs = soup.select(self.sel_config['messages'])
            for msg in msgs:
                result = dict()
                for var, config in self.sel_config['each_msg'].items():
                    v = msg.select_one(config['selector'])
                    if v is not None:
                        # NOTE(security): ``eval`` runs arbitrary code from
                        # the config. The expression is evaluated in this
                        # scope, so the local names here (``v``, ``msg``,
                        # ``result``, ...) are part of the config contract;
                        # do not rename them.
                        v = eval(config['value'])
                    result[var] = v
                yield result

    def save_to_csv(self, file_name='tmp.csv',
                    date_format="%d.%m.%Y %H:%M:%S"):
        """Extract every message, post-process it and write it as CSV.

        Args:
            file_name: Path of the CSV file to write.
            date_format: ``strptime`` format of the raw ``date`` field.

        Returns:
            The ``pandas.DataFrame`` that was written.
        """
        df = pd.DataFrame(list(self.extract()))
        # The post-processing below only applies to fields the config
        # actually produced; an empty export yields a frame without columns.
        if 'name' in df.columns:
            # Fill rows that have no name with the last name seen above them.
            df['name'] = df['name'].ffill()
        if 'date' in df.columns:
            df['date'] = pd.to_datetime(df['date'], format=date_format)
        df.to_csv(file_name)
        return df


def main():
    """Parse the command line and convert an export directory to CSV."""
    parser = argparse.ArgumentParser(
        description="A script that transform messages exported from Telegram"
                    " to CSV file with some config file.")
    parser.add_argument('dir', help="The directory that message exported from"
                                    " Telegram in.")
    parser.add_argument('--file', help="The name CSV file will be.",
                        dest="file", default="tmp.csv")
    parser.add_argument('--config', help="The configuration that content"
                                         " on HTML should be",
                        dest="config", default="text_sels.yaml")
    args = parser.parse_args()

    # Show per-page progress (on stderr) when run as a script.
    logging.basicConfig(level=logging.INFO, format='%(message)s')

    tg_loader = TGMsgLoader(args.dir, args.config)
    tg_loader.save_to_csv(args.file)


if __name__ == '__main__':
    main()