rmusser01 · December 21, 2022 22:53 · Jun 14, 2021 · May 11, 2021 · May 11, 2021 · May 11, 2021
diff --git a/whey-cewler.py b/whey-cewler.py
@@ -98,7 +98,7 @@ def menu_action(self, event):
             self.roots.add(root)
         # get all sitemap entries associated with the selected messages and scrape them for words
         for http_message in self._callbacks.getSiteMap(None):
-            url = str(http_message.getUrl())
+            url = http_message.getUrl().toString()
             for root in self.roots:
                 # will scrape the same URL multiple times if the site map has stored multiple instances
                 # the site map stores multiple instances if it detects differences, so this is desirable

diff --git a/whey-cewler.py b/whey-cewler.py
@@ -17,7 +17,7 @@
 5. Select the Extensions > Create wordlist context menu item.
 
 The wordlist is created to wherever you have the extension configured for output.
-```
+'''
 
 from burp import IBurpExtender
 from burp import IContextMenuFactory

diff --git a/whey-cewler.py b/whey-cewler.py
@@ -1,3 +1,24 @@
+'''
+Based on the initial work of Digininja at https://github.com/digininja/CeWL. While CeWL is a script written
+in Ruby that requires an independent crawl of a website in order to build a custom wordlist, Whey CeWLer
+runs within Portswigger's Burp Suite and parses an already crawled sitemap to build a custom wordlist. It
+does not have the metadata parsing capabilities that CeWL does, but it more than makes up for it in
+convenience.
+
+The name gets its origins from the CeWLer portion of the CO2 Burp extension by Jason Gillam, which is written
+in Java and does something similar, but Whey CeWLer is a completely reimagined extension written in Python,
+making it "way cooler".
+
+Usage:
+1. Point Burp Suite to Jython in the Extender > Options tab.
+2. Install this extension manually in the Extender > Extensions tab.
+3. Select an option for extension output (File, Console or UI).
+4. Right-click on any element in the Target tab's hierarchical sitemap.
+5. Select the Extensions > Create wordlist context menu item.
+
+The wordlist is created to wherever you have the extension configured for output.
+```
+
 from burp import IBurpExtender
 from burp import IContextMenuFactory
 from javax.swing import JMenuItem

diff --git a/whey-cewler.py b/whey-cewler.py
@@ -8,7 +8,6 @@
 
 COMMON_PASSWORDS = ['password']
 TEXT_CONTENT_TYPES = ['text/html', 'application/xml', 'application/json', 'text/plain']
-DEBUG = False
 
 # helpful resource
 # https://github.com/laconicwolf/burp-extensions/blob/master/GenerateForcedBrowseWordlist.py

diff --git a/whey-cewler.py b/whey-cewler.py
@@ -0,0 +1,126 @@
+from burp import IBurpExtender
+from burp import IContextMenuFactory
+from javax.swing import JMenuItem
+from java.util import ArrayList, List
+from HTMLParser import HTMLParser
+from datetime import datetime
+import re
+
+COMMON_PASSWORDS = ['password']
+TEXT_CONTENT_TYPES = ['text/html', 'application/xml', 'application/json', 'text/plain']
+DEBUG = False
+
+# helpful resource
+# https://github.com/laconicwolf/burp-extensions/blob/master/GenerateForcedBrowseWordlist.py
+
+class TagStripper(HTMLParser):
+    '''
+    Attempts to strip all tags from an HTML page received in the HTTP response. The remaining text
+    is appended to an array and then joined with " " for regex parsing.
+    '''
+
+    def __init__(self):
+        HTMLParser.__init__(self)
+        self.page_text = []
+
+    def handle_data(self, data):
+        self.page_text.append(data)
+
+    def handle_comment(self, data):
+        self.page_text.append(data)
+
+    def strip(self, html_page):
+        self.feed(html_page)
+        return " ".join(self.page_text)
+
+
+class BurpExtender(IBurpExtender, IContextMenuFactory):
+    '''
+    BurpExtender Class as per Reference API.
+    '''
+
+    def registerExtenderCallbacks(self, callbacks):
+        '''
+        Registers the extension and initializes the root URLs and word list sets.
+        '''
+        self._callbacks = callbacks
+        self._helpers = callbacks.getHelpers()
+        self.context = None
+        self.roots = set()
+        self.word_list = set(COMMON_PASSWORDS)
+        callbacks.setExtensionName("Whey CeWLer")
+        callbacks.registerContextMenuFactory(self)
+        return
+
+    def createMenuItems(self, context):
+        '''
+        Invokes the "Create Wordlist" Menu.
+        '''
+
+        # Bind the menu item only to the site map: offer it only for the site map tree invocation context
+
+        self.context = context
+        if context.getInvocationContext() == context.CONTEXT_TARGET_SITE_MAP_TREE:
+            menu_list = ArrayList()
+            menu_item = JMenuItem("Create Wordlist", actionPerformed=self.menu_action)
+            menu_list.add(menu_item)
+            return menu_list
+
+    def menu_action(self, event):
+        '''
+        Obtains the selected messages from the interface. Filters the sitemap for all messages whose URLs
+        fall within the selected messages' hierarchy. Each matching message is analyzed to create a word list.
+        '''
+        # get all first-level selected messages and store the URLs as roots to filter the sitemap
+        http_messages = self.context.getSelectedMessages()
+        for http_message in http_messages:
+            root = str(http_message.getUrl())
+            self.roots.add(root)
+        # get all sitemap entries associated with the selected messages and scrape them for words
+        for http_message in self._callbacks.getSiteMap(None):
+            url = str(http_message.getUrl())
+            for root in self.roots:
+                # will scrape the same URL multiple times if the site map has stored multiple instances
+                # the site map stores multiple instances if it detects differences, so this is desirable
+                if url.startswith(root):
+                    # only scrape if there is a response to scrape
+                    http_response = http_message.getResponse()
+                    if http_response:
+                        self.get_words(url, http_response)
+        self.display_words()
+        return
+
+    def get_words(self, url, http_response):
+        '''
+        Checks the header for a text-based content type. If the content type is text-based, uses
+        the TagStripper class to parse out the text and runs a regex to create a wordlist based on
+        the regex criteria. The resulting words are added to the word_list set.
+        '''
+        response = self._helpers.analyzeResponse(http_response)
+        headers = response.getHeaders()[1:]
+        body = self._helpers.bytesToString(http_response[response.getBodyOffset():])
+        for header in headers:
+            name, value = [x.strip() for x in header.split(':', 1)]
+            if name.lower() == 'content-type':
+                content_type = value.split(';')[0].strip()
+                if content_type.lower() not in TEXT_CONTENT_TYPES:
+                    return
+        tag_stripper = TagStripper()
+        page_text = tag_stripper.strip(body)
+        # word characters (alphanumerics and underscores) and apostrophes
+        # at least 3 characters in length
+        word_candidates = re.findall(r"[\w']{3,}", page_text)
+        for word in word_candidates:
+            # strip apostrophes
+            word = word.replace("'", "")
+            # add the word to the list
+            self.word_list.add(word)
+        return
+
+    def display_words(self):
+        '''
+        Displays the word list to whatever Burp is configured for stdout.
+        '''
+        for word in sorted(self.word_list):
+            print word
+        return