import re
import sqlite3
import uuid

import requests
from bs4 import BeautifulSoup

MAX_PAGE = 10000
# PTT requires this cookie to get past the age-verification interstitial.
COOKIES = {'over18': '1'}

con = sqlite3.connect('ptt.sqlite')
cur = con.cursor()
cur.execute('CREATE TABLE IF NOT EXISTS post (id TEXT, author TEXT, title TEXT)')

root_url = 'https://www.ptt.cc/bbs'
root_page = f'{root_url}/Gossiping'


def get_post_urls(page):
    """Return the post URLs listed on one index page of the board."""
    page_url = f'{root_page}/index{page}.html'
    res = requests.get(page_url, cookies=COOKIES)
    # Post links look like /bbs/Gossiping/M.1234567890.A.ABC.html;
    # escape the final dot so '.html' is matched literally.
    urls = re.findall(r'/bbs/Gossiping/M[\w.]+\.html', res.text)
    return urls


def save_meta_to_sql(post_id, author, title):
    # Keep the parameter order aligned with the column order so author
    # and title cannot be silently swapped.
    cur.execute('INSERT INTO post (id, author, title) VALUES (?, ?, ?)',
                [post_id, author, title])
    con.commit()


def save_content_as_file(post_id, post):
    with open(post_id, 'w', encoding='utf-8') as fd:
        fd.write(post)


def parse_author_and_title(post):
    parsed_post = BeautifulSoup(post, 'html.parser')
    post_meta = parsed_post.find_all(class_='article-meta-value')
    post_meta = [meta.text for meta in post_meta]
    # The four meta values on a PTT post are author, board, title, and date.
    author, _board, title, _date = post_meta
    return author, title


def save_post(text):
    post_id = str(uuid.uuid1())
    author, title = parse_author_and_title(text)
    save_content_as_file(post_id, text)
    save_meta_to_sql(post_id, author, title)


def crawl_post_content(post_url):
    # post_url starts with '/bbs', which root_url already contains,
    # so strip the first four characters before joining.
    full_url = f'{root_url}{post_url[4:]}'
    res = requests.get(full_url, cookies=COOKIES)
    save_post(res.text)


def main():
    # Index pages are numbered from 1; index0.html does not exist.
    for page in range(1, MAX_PAGE + 1):
        urls = get_post_urls(page)
        for url in urls:
            crawl_post_content(url)


if __name__ == '__main__':
    main()