Skip to content

Instantly share code, notes, and snippets.

@scudette
Created June 1, 2023 09:43
Show Gist options
  • Save scudette/40f49fb64383eed489667ca9fade93f4 to your computer and use it in GitHub Desktop.
Save scudette/40f49fb64383eed489667ca9fade93f4 to your computer and use it in GitHub Desktop.

Revisions

  1. scudette created this gist Jun 1, 2023.
    98 changes: 98 additions & 0 deletions Generic.Search.PDF.yaml
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,98 @@
    # Artifact header and user-tunable parameters.
    name: Generic.Search.PDF
    description: |
      This artifact searches PDF files for a keyword.

    parameters:
      # Expanded with glob() by the query; every file it yields is searched.
      - name: PDFGlob
        default: /tmp/*.pdf
        description: A glob to find PDF files

      # YARA rule evaluated against each decoded PDF stream.
      # `wide` on its own only matches two-byte (UTF-16 style) text, so
      # `ascii` is listed as well to let the default keyword also hit text
      # stored one byte per character.
      - name: YaraRule
        type: yara
        default: |
          rule X {
            strings:
              $a = "SECRET" wide ascii nocase
            condition: any of them
          }
    # Display types for the result columns produced by the query.
    # The first four columns carry the results of upload() calls; Match
    # carries a "%02x"-formatted string and is declared as hex.
    column_types:
      - name: CompressedStream
        type: upload_preview
      - name: TextStream
        type: upload_preview
      - name: DeflatedStream
        type: upload_preview
      - name: Upload
        type: upload_preview
      - name: Match
        type: hex

    sources:
      - query: |
          -- Deflate(Stream): decompress one PDF stream body.
          -- To decompress a deflate stream convert to a gzip file by
          -- slapping a header on top and removing the deflate header.
          -- The first two bytes of Stream are dropped and a fixed 10-byte
          -- gzip header is prepended; the result is exposed through the
          -- "data" accessor and read back through the "gzip" accessor.
          -- length=100000 bounds how much decompressed data is read.
          LET Deflate(Stream) = read_file(
              length=100000,
              accessor="gzip",
              filename=pathspec(
                  DelegateAccessor="data",
                  DelegatePath="\x1F\x8B\x08\x00\x00\x00\x00\x00\x00\x03" + Stream[2:]))

          -- MatchRule(Data): run the YaraRule parameter over an in-memory
          -- buffer. Yields at most one row (number=1 and LIMIT 1) holding
          -- the hit bytes, their hex form and the hit offset, with 50 bytes
          -- of context requested from yara().
          LET MatchRule(Data) = SELECT String.Data AS Hit,
                  String.HexData AS HexHit,
                  String.Offset AS Offset
              FROM yara(accessor="data",
                        files=Data,
                        number=1, context=50,
                        rules=YaraRule)
              LIMIT 1

          -- Parsed(Filename): carve "<n> <n> obj ... endobj" records out of
          -- the file, then split each record into Header, ObjNumber and
          -- Stream (the bytes between "stream" and "endstream").
          -- Only objects whose header mentions /FlateDecode are kept; the
          -- log() call chained into the WHERE clause reports each of them.
          LET Parsed(Filename) = SELECT parse_string_with_regex(
                  string=Object,
                  regex='''(?sm)(?P<Header>(?P<ObjNumber>\d+ \d+) obj[\r\n]+.+?stream)\r?\n(?P<Stream>.+?)endstream''') AS Data
              FROM parse_records_with_regex(
                  file=Filename,
                  regex='''(?sm)(?P<Object>\d+ \d+ obj[\n\r]+.+?[\r\n]+endobj[\r\n]+)''',
                  buffer_size=1000000)
              WHERE Data.Header =~ "/FlateDecode"
                AND log(message=format(format="%v: Inspecting object %v of size %v",
                                       args=[Filename, Data.ObjNumber, len(list=Data.Stream)]))

          -- Letters(Data): one row per parenthesised run in Data whose
          -- content contains none of "/", "(" or ")"; X is that content.
          LET Letters(Data) = SELECT X
              FROM parse_records_with_regex(
                  accessor="data",
                  file=Data,
                  regex='''\((?P<X>[^/()]+?)\)''')

          -- Text(Data): all Letters() fragments joined with no separator.
          LET Text(Data) = join(sep="",array=Letters(Data=Data).X)

          -- Decoded(Filename): for every object returned by Parsed(), expose
          -- the raw stream, its decompressed form and the text extracted
          -- from the decompressed form. Deflate() appears twice, once per
          -- derived column.
          LET Decoded(Filename) = SELECT Data.ObjNumber AS ObjNumber,
                  Data.Header AS Header,
                  Data.Stream AS CompressedStream,
                  Deflate(Stream=Data.Stream) AS DeflatedStream,
                  Text(Data=Deflate(Stream=Data.Stream)) AS TextStream
              FROM Parsed(Filename=Filename)

          -- SearchFile(Filename): apply MatchRule() to the decompressed
          -- stream concatenated with its extracted text. Rows without a
          -- match are dropped and LIMIT 1 stops at the first matching
          -- object. The row also uploads the file itself and the three
          -- stream variants, named Filename + ObjNumber + a suffix.
          LET SearchFile(Filename) = SELECT ObjNumber, Header,
                  upload(file=Filename) AS Upload,
                  upload(accessor="data", file=CompressedStream,
                         name=Filename + ObjNumber + "Compressed") AS CompressedStream,
                  upload(accessor="data", file=DeflatedStream,
                         name=Filename + ObjNumber + "Deflated") AS DeflatedStream,
                  upload(accessor="data", file=TextStream,
                         name=Filename + ObjNumber + "Text") AS TextStream,
                  MatchRule(Data=DeflatedStream + TextStream) AS Match
              FROM Decoded(Filename=Filename)
              WHERE Match
              LIMIT 1

          -- Entry point: run SearchFile() on every path matched by PDFGlob.
          -- NOTE(review): `Match` is both an input column from SearchFile()
          -- and the alias of the formatted hit below - confirm that
          -- `Match[0].Offset` still resolves to the SearchFile() row rather
          -- than to the formatted string.
          SELECT * FROM foreach(row={
              SELECT * FROM glob(globs=PDFGlob)
          }, query={
              SELECT OSPath, ObjNumber, Header,
                     CompressedStream, DeflatedStream,
                     TextStream, Upload,
                     format(format="%02x", args=Match.Hit) AS Match,
                     Match[0].Offset AS MatchOffset
              FROM SearchFile(Filename=OSPath)
          })