#!/usr/local/bin/python3
'''
FACEBOOK GROUP DATA DOWNLOADER

Put this file (fbg.py) in a directory alongside a file (groups.csv) with
two comma separated columns [group,id].

Add your personal FB credentials to this file (fbg.py) by filling in the
`client_id` and `client_secret` variables below.

Run fbg.py with Python in Terminal:

    $ python fbg.py

For every row of groups.csv the script scrapes the group's posts and the
comments to each post, merges them into one table and saves it as
FBgroup_<group>.csv. When all groups are done, every FBgroup_*.csv file is
also converted to an .xlsx workbook with a single worksheet named "DATA".
'''

# Import and setup
import os
import re
import glob
import pandas as pd
from fb_scrape_public import fb_scrape_public as fsp
import shutil
import time
import xlsxwriter
import csv

# FB credentials
client_id = ''
client_secret = ''

# Temporary working folders; both are wiped before and after each group
POSTS_DIR = 'posts_data'
COMMENTS_DIR = 'comments_data'

# Suffix appended to the post id when a scraped comments file is stored
COMMENTS_SUFFIX = '_scraped.csv'


def _reset_data_dirs():
    """Remove both temporary data folders, skipping any that do not exist."""
    # Each folder is removed independently so that a missing posts folder
    # cannot leave a stale comments folder behind.
    for folder in (POSTS_DIR, COMMENTS_DIR):
        shutil.rmtree(folder, ignore_errors=True)


def _compact_timestamps(times):
    """Return the string Series `times` with its timestamps shortened.

    Dashes and colons are dropped, 'T' becomes '_', a '+0000' offset is
    removed and a leading '20' is stripped, e.g.
    '2017-03-01T12:34:56+0000' -> '170301_123456'.
    """
    times = times.str.replace('-', '', regex=False)
    times = times.str.replace('T', '_', regex=False)
    times = times.str.replace(':', '', regex=False)
    # These two are real regular expressions, so regex matching is requested
    # explicitly instead of relying on the pandas default.
    times = times.str.replace(r'\+0000', '', regex=True)
    return times.str.replace(r'^20', '', regex=True)


def _group_id_from_comments_file(path):
    """Return the part of a comments file name that precedes the first '_'.

    The files are stored as <post id>_scraped.csv, so this is the leading
    component of the post id.
    """
    post_id = os.path.basename(path)[:-len(COMMENTS_SUFFIX)]
    return post_id.split('_')[0]


# Read the file
groups = pd.read_csv("groups.csv")

# Cleanup data folders
_reset_data_dirs()

# Get everything
# Slashes are replaced because the group name becomes part of a file name
grps = [str(g).replace('/', '_') for g in groups['group'].tolist()]

for group, group_id in zip(grps, groups['id'].tolist()):
    print("Processing " + group)

    # Reset data folders
    _reset_data_dirs()

    # Create posts data dir
    os.makedirs(POSTS_DIR)

    # Get the posts; fb_scrape_public leaves its result in fb_data.csv
    fsp.scrape_fb(client_id, client_secret, str(group_id))
    posts_path = os.path.join(POSTS_DIR, str(group_id) + "posts.csv")
    os.rename("fb_data.csv", posts_path)  # save the file to the data directory

    posts_df = pd.read_csv(posts_path)
    # Remove linebreaks and tabs in the dataframe
    posts_df = posts_df.replace({'\n|\t|\r': ' '}, regex=True)

    # Add a copy of the message column for future sorting purposes
    posts_df['post_sorting'] = posts_df['message']

    # Get the post ids from the posts dataframe to be able to scrape the
    # comments.
    # NOTE(review): assumes the 17th column of the scraper output holds the
    # post id - confirm against fb_scrape_public.
    post_ids = posts_df.iloc[:, 16].tolist()

    # Create comments data dir
    os.makedirs(COMMENTS_DIR)

    # Get the comments
    for count, post_id in enumerate(post_ids):
        post_id = str(post_id)
        print("Getting comments to post " + str(count + 1) + "/" +
              str(len(post_ids)) + " (" + post_id + ")")
        try:
            fsp.scrape_fb(client_id, client_secret, post_id,
                          scrape_mode="comments")
            # Save the file to the data directory
            os.rename("fb_data.csv",
                      os.path.join(COMMENTS_DIR, post_id + COMMENTS_SUFFIX))
        except Exception as e:
            # Best effort: report the failure, pause, and go on with the
            # next post instead of aborting the whole group.
            print("Could not get comments to post " + post_id + ": " + str(e))
            time.sleep(2)

    # Create a comments dataframe
    all_comments_files = glob.glob(os.path.join(COMMENTS_DIR, "*.csv"))

    # Make a list of dataframes and add a column with group_id
    df_from_each_file = []
    for count, f in enumerate(all_comments_files):
        df = pd.read_csv(f)
        df['group_id'] = _group_id_from_comments_file(f)
        df_from_each_file.append(df)
        print("Reading file " + str(count + 1) + "/" +
              str(len(all_comments_files)))

    if not df_from_each_file:
        print("Unable to get data from this group.")
        continue

    print("Joining files ...")
    comments_df = pd.concat(df_from_each_file, ignore_index=True)
    print("Done!")

    # Rename the 'original_message' column for future sorting purposes
    comments_df.rename(columns={'original_message': 'post_sorting'},
                       inplace=True)

    # Keep only some of the columns in comments_df
    comments_df = comments_df.loc[:, ['from', 'comment', 'created_time',
                                      'group_id', 'post_sorting']]

    # Add a column 'type' to the comments_df
    comments_df['type'] = 'comment'

    # Rename column 'comment' to 'message'
    comments_df.rename(columns={'comment': 'message'}, inplace=True)

    # Nothing above adds a group_id to the posts, so it is filled in here
    # unless the scraper output already carries one; otherwise the column
    # selection below would fail on a missing label.
    if 'group_id' not in posts_df.columns:
        posts_df['group_id'] = str(group_id)

    # Keep only some of the columns in posts_df, in the order of comments_df
    posts_df = posts_df[['from', 'message', 'created_time', 'group_id',
                         'post_sorting', 'type']]

    fb_groups = pd.concat([posts_df, comments_df], ignore_index=True)

    # Reorder columns
    fb_groups = fb_groups[['group_id', 'post_sorting', 'created_time',
                           'type', 'from', 'message']]

    # Rename column 'created_time' to 'time'
    fb_groups.rename(columns={'created_time': 'time'}, inplace=True)

    # Clean up 'time' column
    fb_groups['time'] = _compact_timestamps(fb_groups['time'])

    # Sort first by group id, then by date
    fb_groups = fb_groups.sort_values(['group_id', 'time', 'post_sorting'],
                                      ascending=True)

    # Replace linebreaks and tabs in the dataframe
    fb_groups = fb_groups.replace({'\n|\r|\t': ' '}, regex=True)

    # Final order: by thread (the original post's message), then by date
    fb_groups = fb_groups.sort_values(['post_sorting', 'time'])

    file_name = "FBgroup_" + group + ".csv"

    # Save to file
    fb_groups.to_csv(file_name, index=False)
    print("File " + file_name + " saved.\n")

# Remove the temporary folders left by the last group
_reset_data_dirs()

# When done, also create xlsx versions of all csvs.
# Only the .csv files are matched so that workbooks left by an earlier run
# are not read back in as if they were csv data.
for csvfile in glob.glob("FBgroup_*.csv"):
    # Create a workbook
    workbook = xlsxwriter.Workbook(os.path.splitext(csvfile)[0] + ".xlsx")
    try:
        worksheet = workbook.add_worksheet("DATA")  # add and name a worksheet
        # newline="" lets the csv module handle line breaks inside quoted
        # fields; utf-8 is what DataFrame.to_csv writes by default.
        with open(csvfile, "r", newline="", encoding="utf-8") as input_csv:
            for i, row in enumerate(csv.reader(input_csv)):
                worksheet.write_row(i, 0, row)
    finally:
        workbook.close()