Created
July 1, 2016 18:49
-
-
Save edglazer/35f1c33b65a85d75892eade0dbcad8c1 to your computer and use it in GitHub Desktop.
Revisions
-
edglazer created this gist
Jul 1, 2016. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,264 @@ Timer unit: 1e-06 s Total time: 1401.2 s File: tribe/extract.py Function: extract_graph at line 105 Line # Hits Time Per Hit % Time Line Contents ============================================================== 105 @profile 106 def extract_graph(self): 107 """ 108 Extracts a Graph where the nodes are EmailAddress 109 """ 110 111 1 3 3.0 0.0 def relationships(email): 112 """ 113 Inner function that constructs email relationships 114 """ 115 people = [email.sender,] 116 people.extend(email.recipients) 117 people.extend(email.copied) 118 119 people = filter(lambda p: p is not None, people) # Filter out any None addresses 120 people = set(addr.email for addr in people if addr.email) # Obtain only unique people 121 people = sorted(people) # Sort lexicographically for combinations 122 123 for combo in combinations(people, 2): 124 yield combo 125 126 127 # Keep track of all the email to email links 128 1 18 18.0 0.0 links = FreqDist() 129 130 # Iterate over all the extracted emails 131 # Catch exceptions, if any, and move forward 132 # NOTE: This will allow the progress bar to work 133 # NOTE: This will build the graph data structure in memory 134 114956 1159836727 10089.4 82.8 for email in self.extract(): 135 114955 136897 1.2 0.0 try: 136 4870025 7657826 1.6 0.5 for combo in relationships(email): 137 4755070 7686760 1.6 0.5 links[combo] += 1 138 except Exception as e: 139 self.errors[e] += 1 140 continue 141 142 # Construct the networkx graph and add edges 143 1 66 66.0 0.0 G = nx.Graph(name="Email Network", mbox=self.path, extracted=strfnow()) 144 4173 67408 16.2 0.0 for link in links.keys(): 145 4173 225812988 54112.9 16.1 G.add_edge(*link, 
weight=links.freq(link)) 146 147 # Return the generated graph 148 return G ➜ tribe git:(master) ✗ pip install -U memory_profiler Requirement already up-to-date: memory_profiler in /usr/local/lib/python2.7/site-packages ➜ tribe git:(master) ✗ pip install psutil Requirement already satisfied (use --upgrade to upgrade): psutil in /usr/local/lib/python2.7/site-packages ➜ tribe git:(master) ✗ pip install -U psutil Requirement already up-to-date: psutil in /usr/local/lib/python2.7/site-packages ➜ tribe git:(master) ✗ kernprof --help Usage: kernprof [-s setupfile] [-o output_file_path] scriptfile [arg] ... Options: --version show program's version number and exit -h, --help show this help message and exit -l, --line-by-line Use the line-by-line profiler from the line_profiler module instead of Profile. Implies --builtin. -b, --builtin Put 'profile' in the builtins. Use 'profile.enable()' and 'profile.disable()' in your code to turn it on and off, or '@profile' to decorate a single function, or 'with profile:' to profile a single section of code. -o OUTFILE, --outfile=OUTFILE Save stats to <outfile> -s SETUP, --setup=SETUP Code to execute before the code to profile -v, --view View the results of the profile in addition to saving it. 
➜ tribe git:(master) ✗ python -m memory_profiler tribe-admin.py extract -w myemails.graphml allmail.mbox Starting Graph extraction, a long running process Initializing MBox iteration on allmail.mbox (8.8GiB) ^CFilename: tribe/extract.py seconds | Parsed: 97431 emails Line # Mem usage Increment Line Contents ================================================ 105 70.691 MiB 0.000 MiB @profile 106 def extract_graph(self): 107 """ 108 Extracts a Graph where the nodes are EmailAddress 109 """ 110 111 490.387 MiB 419.695 MiB def relationships(email): 112 """ 113 Inner function that constructs email relationships 114 """ 115 490.387 MiB 0.000 MiB people = [email.sender,] 116 490.387 MiB 0.000 MiB people.extend(email.recipients) 117 490.387 MiB 0.000 MiB people.extend(email.copied) 118 119 490.387 MiB 0.000 MiB people = filter(lambda p: p is not None, people) # Filter out any None addresses 120 490.387 MiB 0.000 MiB people = set(addr.email for addr in people if addr.email) # Obtain only unique people 121 490.387 MiB 0.000 MiB people = sorted(people) # Sort lexicographically for combinations 122 123 490.387 MiB 0.000 MiB for combo in combinations(people, 2): 124 490.387 MiB 0.000 MiB yield combo 125 126 127 # Keep track of all the email to email links 128 70.691 MiB -419.695 MiB links = FreqDist() 129 130 # Iterate over all the extracted emails 131 # Catch exceptions, if any, and move forward 132 # NOTE: This will allow the progress bar to work 133 # NOTE: This will build the graph data structure in memory 134 490.387 MiB 419.695 MiB for email in self.extract(): 135 490.387 MiB 0.000 MiB try: 136 490.387 MiB 0.000 MiB for combo in relationships(email): 137 490.387 MiB 0.000 MiB links[combo] += 1 138 except Exception as e: 139 self.errors[e] += 1 140 continue 141 142 # Construct the networkx graph and add edges 143 G = nx.Graph(name="Email Network", mbox=self.path, extracted=strfnow()) 144 for link in links.keys(): 145 G.add_edge(*link, weight=links.freq(link)) 146 147 # 
Return the generated graph 148 return G Traceback (most recent call last): File "/usr/local/Cellar/python/2.7.10_2/Frameworks/Python.framework/Versions/2.7/lib/python2.7/runpy.py", line 162, in _run_module_as_main "__main__", fname, loader, pkg_name) File "/usr/local/Cellar/python/2.7.10_2/Frameworks/Python.framework/Versions/2.7/lib/python2.7/runpy.py", line 72, in _run_code exec code in run_globals File "/usr/local/lib/python2.7/site-packages/memory_profiler.py", line 982, in <module> exec_with_profiler(script_filename, prof) File "/usr/local/lib/python2.7/site-packages/memory_profiler.py", line 917, in exec_with_profiler execfile(filename, ns, ns) File "tribe-admin.py", line 179, in <module> main(*sys.argv[1:]) File "tribe-admin.py", line 173, in main msg = args.func(args) # Call the default function File "tribe-admin.py", line 91, in extract errors, seconds = timed_inner(args.mbox[0], args.write) File "tribe/utils.py", line 118, in wrapper result = func(*args, **kwargs) File "tribe-admin.py", line 86, in timed_inner G = reader.extract_graph() File "/usr/local/lib/python2.7/site-packages/memory_profiler.py", line 498, in f return func(*args, **kwds) File "tribe/extract.py", line 134, in extract_graph for email in self.extract(): File "tribe/extract.py", line 97, in extract for msg in self: File "tribe/extract.py", line 177, in __iter__ for msg in super(ConsoleMBoxReader, self).__iter__(): File "tribe/extract.py", line 46, in __iter__ for msg in self.mbox: File "/usr/local/Cellar/python/2.7.10_2/Frameworks/Python.framework/Versions/2.7/lib/python2.7/mailbox.py", line 108, in itervalues value = self[key] File "/usr/local/Cellar/python/2.7.10_2/Frameworks/Python.framework/Versions/2.7/lib/python2.7/mailbox.py", line 80, in __getitem__ return self.get_message(key) File "/usr/local/Cellar/python/2.7.10_2/Frameworks/Python.framework/Versions/2.7/lib/python2.7/mailbox.py", line 775, in get_message msg = self._message_factory(string.replace(os.linesep, '\n')) File 
"/usr/local/Cellar/python/2.7.10_2/Frameworks/Python.framework/Versions/2.7/lib/python2.7/mailbox.py", line 1598, in __init__ Message.__init__(self, message) File "/usr/local/Cellar/python/2.7.10_2/Frameworks/Python.framework/Versions/2.7/lib/python2.7/mailbox.py", line 1459, in __init__ self._become_message(email.message_from_string(message)) File "/usr/local/Cellar/python/2.7.10_2/Frameworks/Python.framework/Versions/2.7/lib/python2.7/email/__init__.py", line 57, in message_from_string return Parser(*args, **kws).parsestr(s) File "/usr/local/Cellar/python/2.7.10_2/Frameworks/Python.framework/Versions/2.7/lib/python2.7/email/parser.py", line 82, in parsestr return self.parse(StringIO(text), headersonly=headersonly) File "/usr/local/Cellar/python/2.7.10_2/Frameworks/Python.framework/Versions/2.7/lib/python2.7/email/parser.py", line 71, in parse feedparser.feed(data) File "/usr/local/Cellar/python/2.7.10_2/Frameworks/Python.framework/Versions/2.7/lib/python2.7/email/feedparser.py", line 178, in feed self._call_parse() File "/usr/local/Cellar/python/2.7.10_2/Frameworks/Python.framework/Versions/2.7/lib/python2.7/email/feedparser.py", line 182, in _call_parse self._parse() File "/usr/local/Cellar/python/2.7.10_2/Frameworks/Python.framework/Versions/2.7/lib/python2.7/email/feedparser.py", line 373, in _parsegen for retval in self._parsegen(): File "/usr/local/Cellar/python/2.7.10_2/Frameworks/Python.framework/Versions/2.7/lib/python2.7/email/feedparser.py", line 446, in _parsegen if line is NeedMoreData: File "/usr/local/Cellar/python/2.7.10_2/Frameworks/Python.framework/Versions/2.7/lib/python2.7/email/feedparser.py", line 446, in _parsegen if line is NeedMoreData: File "/usr/local/lib/python2.7/site-packages/memory_profiler.py", line 537, in trace_memory_usage def trace_memory_usage(self, frame, event, arg): KeyboardInterrupt Exception KeyboardInterrupt in <module 'threading' from 
'/usr/local/Cellar/python/2.7.10_2/Frameworks/Python.framework/Versions/2.7/lib/python2.7/threading.pyc'> ignored ^X% ➜ tribe git:(master) ✗ ➜ tribe git:(master) ✗ kernprof -l tribe-admin.py extract -w myemails.graphml allmail.mbox Starting Graph extraction, a long running process Initializing MBox iteration on allmail.mbox (8.8GiB) ^CWrote profile results to tribe-admin.py.lprof. Elapsed: 14 minutes 41 seconds | initializing ... Session Restored Last login: Fri Jul 1 11:44:11 on console You have new mail. ➜ tribe git:(master) ✗ kernprof -l tribe-admin.py extract -w myemails.graphml allmail.mbox Starting Graph extraction, a long running process Initializing MBox iteration on allmail.mbox (8.8GiB) Elapsed: 18 minutes 35 seconds | Parsed: 114856 emails GraphML written out to myemails.graphml No errors encountered in processing Graph extraction took 19 minutes 44 seconds Wrote profile results to tribe-admin.py.lprof ➜ tribe git:(master) ✗ python -m line_profiler tribe-admin.py.lprof Timer unit: 1e-06 s Total time: 1119.33 s File: tribe/extract.py Function: extract_graph at line 105 Line # Hits Time Per Hit % Time Line Contents ============================================================== 105 @profile 106 def extract_graph(self): 107 """ 108 Extracts a Graph where the nodes are EmailAddress 109 """ 110 111 1 13 13.0 0.0 def relationships(email): 112 """ 113 Inner function that constructs email relationships 114 """ 115 people = [email.sender,] 116 people.extend(email.recipients) 117 people.extend(email.copied) 118 119 people = filter(lambda p: p is not None, people) # Filter out any None addresses 120 people = set(addr.email for addr in people if addr.email) # Obtain only unique people 121 people = sorted(people) # Sort lexicographically for combinations 122 123 for combo in combinations(people, 2): 124 yield combo 125 126 127 # Keep track of all the email to email links 128 1 8 8.0 0.0 links = FreqDist() 129 130 # Iterate over all the extracted emails 131 # Catch 
exceptions, if any, and move forward 132 # NOTE: This will allow the progress bar to work 133 # NOTE: This will build the graph data structure in memory 134 114956 1094585725 9521.8 97.8 for email in self.extract(): 135 114955 116953 1.0 0.0 try: 136 4870025 7024773 1.4 0.6 for combo in relationships(email): 137 4755070 7022225 1.5 0.6 links[combo] += 1 138 except Exception as e: 139 self.errors[e] += 1 140 continue 141 142 # Construct the networkx graph and add edges 143 1 68 68.0 0.0 G = nx.Graph(name="Email Network", mbox=self.path, extracted=strfnow()) 144 1837532 1518627 0.8 0.1 for link in links.keys(): 145 G.add_edge(*link, weight=links.freq(link)) # took out Changed per suggestion of @bbengfort, as calculation may be too strenuous 146 1837531 9065502 4.9 0.8 # G.add_edge(*link, weight=links[link]) 147 148 # Return the generated graph 149 1 3 3.0 0.0 return G