import re
import sqlite3
import uuid

import requests
from bs4 import BeautifulSoup

MAX_PAGE = 10000
# PTT requires this cookie to get past the age-verification interstitial.
COOKIES = {'over18': '1'}

con = sqlite3.connect('ptt.sqlite')
cur = con.cursor()
cur.execute('CREATE TABLE IF NOT EXISTS post (id TEXT, author TEXT, title TEXT)')

root_url = 'https://www.ptt.cc/bbs'
root_page = f'{root_url}/Gossiping'


def get_post_urls(page):
    """Return the post URLs listed on one index page of the board."""
    page_url = f'{root_page}/index{page}.html'
    res = requests.get(page_url, cookies=COOKIES)
    # Post links look like /bbs/Gossiping/M.1234567890.A.ABC.html;
    # escape the final dot so '.html' is matched literally.
    urls = re.findall(r'/bbs/Gossiping/M[\w.]+\.html', res.text)
    return urls


def save_meta_to_sql(post_id, author, title):
    # Keep the parameter order aligned with the column order so author
    # and title cannot be silently swapped.
    cur.execute('INSERT INTO post (id, author, title) VALUES (?, ?, ?)',
                [post_id, author, title])
    con.commit()


def save_content_as_file(post_id, post):
    with open(post_id, 'w', encoding='utf-8') as fd:
        fd.write(post)


def parse_author_and_title(post):
    parsed_post = BeautifulSoup(post, 'html.parser')
    post_meta = parsed_post.find_all(class_='article-meta-value')
    post_meta = [meta.text for meta in post_meta]
    # The four meta values on a PTT post are author, board, title, and date.
    author, _board, title, _date = post_meta
    return author, title


def save_post(text):
    post_id = str(uuid.uuid1())
    author, title = parse_author_and_title(text)
    save_content_as_file(post_id, text)
    save_meta_to_sql(post_id, author, title)


def crawl_post_content(post_url):
    # post_url starts with '/bbs', which root_url already contains,
    # so strip the first four characters before joining.
    full_url = f'{root_url}{post_url[4:]}'
    res = requests.get(full_url, cookies=COOKIES)
    save_post(res.text)


def main():
    # Index pages are numbered from 1; index0.html does not exist.
    for page in range(1, MAX_PAGE + 1):
        urls = get_post_urls(page)
        for url in urls:
            crawl_post_content(url)


if __name__ == '__main__':
    main()