#!/usr/local/bin/python3
'''
FACEBOOK GROUP DATA DOWNLOADER

Put this file (fbg.py) in a directory alongside a file (groups.csv)
that has a header row and two comma-separated columns: group,id.

Add your personal FB credentials to this file (fbg.py, lines 25-26).

Run fbg.py with Python in Terminal:
$ python fbg.py
'''
# Import and setup
import os
import re
import glob
import pandas as pd
from fb_scrape_public import fb_scrape_public as fsp
import shutil
import time
import xlsxwriter
import csv

# FB credentials
client_id = ''
client_secret = ''

# Read the list of groups to download. The file needs a header row with the
# columns "group" and "id". pd.read_csv is used because DataFrame.from_csv
# was deprecated in pandas 0.21 and removed in pandas 1.0.
groups = pd.read_csv("groups.csv")

# Remove data folders left over from an earlier run. Each folder is removed
# on its own so that a missing 'posts_data' cannot leave a stale
# 'comments_data' behind; errors are ignored (best effort, as before).
for data_dir in ('posts_data', 'comments_data'):
    shutil.rmtree(data_dir, ignore_errors=True)

# Group names end up in output file names, so '/' is replaced by '_'.
# str() guards against names that pandas parsed as numbers.
grps = [str(g).replace('/', '_') for g in groups['group'].tolist()]

# Download the posts and comments of every group listed in groups.csv and
# write one merged, sorted CSV file ("FBgroup_<group>.csv") per group.
for group, group_id in zip(grps, groups['id'].tolist()):

    print("Processing " + group)

    # Start every group with empty data folders. Each folder is reset on its
    # own so that a missing 'posts_data' cannot leave an old 'comments_data'
    # in place (which would make os.makedirs fail).
    for data_dir in ('posts_data', 'comments_data'):
        shutil.rmtree(data_dir, ignore_errors=True)
        os.makedirs(data_dir)

    # Get the posts. The scraped data is picked up from "fb_data.csv" in the
    # working directory; the return value of scrape_fb is not needed.
    fsp.scrape_fb(client_id, client_secret, str(group_id))  # use fb_scrape_public
    posts_path = os.path.join('posts_data', str(group_id) + "posts.csv")
    os.rename("fb_data.csv", posts_path)  # save the file to the data directory
    posts_df = pd.read_csv(posts_path)
    posts_df = posts_df.replace({r'\n|\t|\r': ' '}, regex=True)  # remove linebreaks and tabs

    # Add a copy of the message column for future sorting purposes
    posts_df['post_sorting'] = posts_df['message']

    # Get the post ids to be able to scrape the comments.
    # NOTE(review): column 16 is presumably the post id column of the
    # fb_scrape_public posts file -- confirm against that library.
    post_ids = [str(post_id) for post_id in posts_df.iloc[:, 16].tolist()]

    # Get the comments, one file per post
    for count, post_id in enumerate(post_ids, start=1):
        print("Getting comments to post " + str(count) + "/" + str(len(post_ids)) + " (" + post_id + ")")

        try:
            fsp.scrape_fb(client_id, client_secret, post_id, scrape_mode="comments")  # use fb_scrape_public
            os.rename("fb_data.csv", os.path.join('comments_data', post_id + "_scraped.csv"))
        except Exception as e:
            # Best effort: report the failure, pause briefly and carry on
            # with the next post instead of aborting the whole group.
            print("Could not get comments to post " + post_id + ": " + str(e))
            time.sleep(2)

    # Read every comments file and tag its rows with a group id
    all_comments_files = glob.glob(os.path.join('comments_data', '*.csv'))

    df_from_each_file = []
    for count, comments_path in enumerate(all_comments_files, start=1):
        df = pd.read_csv(comments_path)
        # Files are named "<post id>_scraped.csv"; the group id is taken to
        # be the part of the name before the first underscore.
        df['group_id'] = os.path.basename(comments_path).split('_')[0]
        df_from_each_file.append(df)

        print("Reading file " + str(count) + "/" + str(len(all_comments_files)))

    if not df_from_each_file:
        print("Unable to get data from this group.")
        continue

    print("Joining files ...")
    comments_df = pd.concat(df_from_each_file, ignore_index=True)
    print("Done!")

    # Keep only the needed comment columns, then give them the names used in
    # the merged table ('original_message' becomes the sorting key).
    # reindex() fills a missing column with NaN, as .loc did on old pandas,
    # instead of raising KeyError as .loc does on pandas >= 1.0.
    comments_df = comments_df.reindex(
        columns=['from', 'comment', 'created_time', 'group_id', 'original_message'])
    comments_df = comments_df.rename(
        columns={'comment': 'message', 'original_message': 'post_sorting'})
    comments_df['type'] = 'comment'

    # Keep only the matching columns of posts_df, in the same order.
    # NOTE(review): if the posts file has no 'group_id' column, post rows
    # get an empty group_id here -- confirm whether that is intended.
    posts_df = posts_df.reindex(
        columns=['from', 'message', 'created_time', 'group_id', 'post_sorting', 'type'])

    fb_groups = pd.concat([posts_df, comments_df], ignore_index=True)

    # Reorder columns and rename 'created_time' to 'time'
    fb_groups = fb_groups[['group_id', 'post_sorting', 'created_time', 'type', 'from', 'message']]
    fb_groups = fb_groups.rename(columns={'created_time': 'time'})

    # Clean up the 'time' column, e.g. "2017-03-01T12:34:56+0000" becomes
    # "170301_123456". regex= is passed explicitly because the default of
    # Series.str.replace changed from True to False in pandas 2.0.
    time_cleanup = [
        ('-', '', False),
        ('T', '_', False),
        (':', '', False),
        (r'\+0000', '', True),
        (r'^20', '', True),
    ]
    for pattern, replacement, is_regex in time_cleanup:
        fb_groups['time'] = fb_groups['time'].str.replace(pattern, replacement, regex=is_regex)

    # NOTE(review): this sort is superseded by the one below; it is kept
    # only to avoid changing how equal rows are ordered.
    fb_groups = fb_groups.sort_values(['group_id', 'time', 'post_sorting'], ascending=True)

    # Replace linebreaks and tabs in the dataframe
    fb_groups = fb_groups.replace({r'\n|\r|\t': ' '}, regex=True)

    # Sort first by thread (post text), then by date
    fb_groups = fb_groups.sort_values(['post_sorting', 'time'])

    file_name = "FBgroup_" + group + ".csv"

    # Save to file
    fb_groups.to_csv(file_name, index=False)
    print("File " + file_name + " saved.\n")

        
# Remove the temporary data folders. Each one is removed on its own so that
# a missing 'posts_data' does not prevent removal of 'comments_data'.
for data_dir in ('posts_data', 'comments_data'):
    shutil.rmtree(data_dir, ignore_errors=True)

# When done, also create xlsx versions of all csvs.
# Only "*.csv" is matched: a plain "FBgroup*" would also pick up the .xlsx
# files written by an earlier run and try to parse them as CSV.
csvs = glob.glob("FBgroup*.csv")

for csvfile in csvs:
    workbook = xlsxwriter.Workbook(os.path.splitext(csvfile)[0] + ".xlsx")  # create a workbook
    worksheet = workbook.add_worksheet("DATA")  # add and name a worksheet

    # pandas wrote the file as UTF-8, so read it back with the same encoding
    # rather than the platform default; newline="" is what the csv module
    # expects for files it parses.
    with open(csvfile, "r", newline="", encoding="utf-8") as input_csv:
        for row_number, row in enumerate(csv.reader(input_csv)):
            worksheet.write_row(row_number, 0, row)
    workbook.close()