"""Count how many ``contact_raw`` records have an e-mail-like string in their summary.

Reads the database credentials from the local ``CREDENTIALS`` file, substitutes
them into ``DEST_DATABASE_URL``, scans ``linkedin_data.contact_raw`` and prints
every list of matches plus a running tally.
"""
import re

from pymongo import MongoClient

from settings import DEST_DATABASE_URL

# Loose e-mail matcher: any run of non-whitespace, an "@", then "word.word".
# Compiled once here instead of being looked up for every record in the loop.
EMAIL_PATTERN = re.compile(r'\S+@\w+\.\w+')

# Print the running tally every this many records.
PROGRESS_INTERVAL = 1000


def main():
    """Scan all records, print the e-mails found and the running/final counts."""
    with open("CREDENTIALS") as f:
        credentials = f.read().strip()

    # The URL in settings carries a literal "$(cat CREDENTIALS)" placeholder
    # that is replaced with the contents of the credentials file.
    client = MongoClient(DEST_DATABASE_URL.replace("$(cat CREDENTIALS)", credentials))
    try:
        coll = client.linkedin_data.contact_raw
        counts = {"total": 0, "with_email": 0}

        # Only the "summary" field is projected; records are walked in
        # reverse natural order.
        for rec in coll.find({}, {"summary": 1}).sort("$natural", -1):
            counts["total"] += 1
            summary = rec.get("summary")
            if summary:
                # str() guards against summaries that are not stored as text.
                emails = EMAIL_PATTERN.findall(str(summary))
                if emails:
                    print(emails)
                    counts["with_email"] += 1
            if counts["total"] % PROGRESS_INTERVAL == 0:
                print(counts)

        # Report the final tally too, unless the last record already
        # triggered a progress print; otherwise the tail of the scan is lost.
        if counts["total"] % PROGRESS_INTERVAL != 0:
            print(counts)
    finally:
        # Always release the connection pool, even if the scan fails midway.
        client.close()


if __name__ == '__main__':
    main()

# Figures noted alongside the script (presumably from earlier runs):
# {'total': 6400000, 'with_email': 43571}
# 11,000,000 records, .006807969, 75k emails