Skip to content

Instantly share code, notes, and snippets.

@nstevens1040
Last active November 4, 2021 19:11
Show Gist options
  • Select an option

  • Save nstevens1040/e921aeafedd25f58e7dd56aad38658ad to your computer and use it in GitHub Desktop.

Select an option

Save nstevens1040/e921aeafedd25f58e7dd56aad38658ad to your computer and use it in GitHub Desktop.

Revisions

  1. nstevens1040 revised this gist Nov 4, 2021. 1 changed file with 31 additions and 2 deletions.
    33 changes: 31 additions & 2 deletions iText7_Copy_PDF_Text_.ps1
    Original file line number Diff line number Diff line change
    @@ -62,8 +62,37 @@ namespace ExtractPdf
    # NOTE(review): this hunk is rendered without +/- diff markers, so pre-revision and
    # post-revision lines appear interleaved; it is not runnable exactly as shown.
    $from_page = 5
    $to_page = 10
    $source_pdf_file = "C:\.TEMP\PDF\source.pdf"
    # NOTE(review): presumably a removed (pre-revision) line - the call is re-opened by the
    # $pdf_ assignment two lines below. Confirm against the raw diff.
    $content = [ExtractPdf.Text]::From(
    $output_txt_file = "C:\.TEMP\TXT\destin.txt"
    $pdf_ = [ExtractPdf.Text]::From(
    $source_pdf_file,
    $from_page,
    $to_page
    )
    # NOTE(review): the second closing parenthesis looks like the other half of a
    # removed/added pair rather than real code - confirm against the raw diff.
    )

    # Render every extracted page as a fixed-width text box and write all pages to
    # $output_txt_file. Expects $pdf_ (page number string -> page text) and
    # $output_txt_file to be set by the preceding statements.
    $cr = [string][char]13
    $sixty = '============================================================='
    $one31 = '-------------------------------------------------------------------------------------------------------------------------------------'
    $crlf = "$([char]13)$([char]10)"
    $down = [string][char]0x2193    # downwards arrow, marks the top border of a page
    $up = [string][char]0x2191      # upwards arrow, marks the bottom border of a page
    $box_width = 133                # total width of a body row, including both '|' borders
    $pages = [System.Collections.Generic.List[string]]::new()
    # Dictionary enumeration order is not guaranteed by .NET, so sort the page keys
    # numerically to make sure pages are written in ascending page order.
    foreach($key in @($pdf_.Keys | Sort-Object { [int]$_ }))
    {
        $page_num = $key
        $content = $pdf_[$key]
        $formatted = [System.Collections.Generic.List[string]]::new()
        foreach($ine in $content.Split([char]10))
        {
            # Prefix the left border and drop any carriage return left over from CRLF text.
            $line = ("| " + $ine).Replace($cr,[string]::Empty)
            # PadRight never pads by a negative amount. The previous (0..$fill) range counted
            # downwards for negative $fill and appended extra spaces to over-long lines.
            $formatted.Add($line.PadRight($box_width - 1) + "|")
        }
        $fstring = [string]::Join($crlf,$formatted)
        $header = "$($down)$($sixty) page $($page_num) $($sixty)$($down)"
        $footer = "$($up)$($sixty) page $($page_num) $($sixty)$($up)"
        $pages.Add("$($header)$($crlf)$($fstring)$($crlf)$($footer)$($crlf)$($one31)")
    }
    [string]::Join($crlf,$pages) | Out-File $output_txt_file -Encoding utf8

  2. nstevens1040 created this gist Nov 4, 2021.
    69 changes: 69 additions & 0 deletions iText7_Copy_PDF_Text_.ps1
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,69 @@
    # Folder containing the iText 7 build output. $BINFOLDER is not assigned anywhere in
    # this script; presumably the caller sets it beforehand - confirm.
    $itext_bin = "$($BINFOLDER)\itext7-dotnet\itext\itext.pdfa\bin\Debug\net461"
    # Assembly file names, in the order they are loaded and referenced.
    $assembly_names = @(
        'itext.pdfa.dll',
        'Microsoft.Bcl.AsyncInterfaces.dll',
        'Microsoft.Extensions.DependencyInjection.Abstractions.dll',
        'Microsoft.Extensions.DependencyInjection.dll',
        'Microsoft.Extensions.Logging.Abstractions.dll',
        'Microsoft.Extensions.Logging.dll',
        'Microsoft.Extensions.Options.dll',
        'Microsoft.Extensions.Primitives.dll',
        'System.Buffers.dll',
        'System.Diagnostics.DiagnosticSource.dll',
        'System.Memory.dll',
        'System.Numerics.Vectors.dll',
        'System.Runtime.CompilerServices.Unsafe.dll',
        'System.Threading.Tasks.Extensions.dll',
        'System.ValueTuple.dll',
        'BouncyCastle.Crypto.dll',
        'itext.commons.dll',
        'itext.io.dll',
        'itext.kernel.dll'
    )
    # Full paths; also passed to -ReferencedAssemblies when the C# helper is compiled.
    $REFS = @(foreach($name in $assembly_names) { "$($itext_bin)\$($name)" })
    # Load each assembly into the current session.
    foreach($ref in $REFS)
    {
        Add-Type -Path $ref
    }

    # Compile the ExtractPdf.Text helper against the assemblies listed in $REFS.
    # The C# source sits in an expandable here-string, so it must not contain dollar signs or backticks.
    Add-Type -TypeDefinition @"
    namespace ExtractPdf
    {
        using System;
        using System.IO;
        using System.Text;
        using System.Linq;
        using iText.IO;
        using iText.Pdfa;
        using iText.Kernel;
        using iText.Kernel.Pdf;
        using System.Collections.Generic;
        public class Text
        {
            /// <summary>
            /// Extracts the text of a range of pages from a PDF file.
            /// </summary>
            /// <param name="filepath">Path of the PDF file to read.</param>
            /// <param name="start_page">First page to extract (the loop starts at this page number).</param>
            /// <param name="end_page">Last page to extract, inclusive; 0 means up to the last page of the document.</param>
            /// <returns>A dictionary keyed by the page number (as a string) whose value is the text extracted from that page.</returns>
            public static Dictionary<string,string> From(string filepath, int start_page = 1, int end_page = 0)
            {
                PdfReader pdfReader = new PdfReader(filepath);
                try
                {
                    PdfDocument pdfDoc = new PdfDocument(pdfReader);
                    try
                    {
                        Dictionary<string,string> results = new Dictionary<string,string>();
                        if(end_page == 0)
                        {
                            end_page = pdfDoc.GetNumberOfPages();
                        }
                        for(int i = start_page; i <= end_page; i++)
                        {
                            // A fresh strategy per page keeps the text of one page out of the next.
                            iText.Kernel.Pdf.Canvas.Parser.Listener.ITextExtractionStrategy strategy = new iText.Kernel.Pdf.Canvas.Parser.Listener.SimpleTextExtractionStrategy();
                            string pageContent = iText.Kernel.Pdf.Canvas.Parser.PdfTextExtractor.GetTextFromPage(pdfDoc.GetPage(i), strategy);
                            results.Add(i.ToString(), pageContent);
                        }
                        return results;
                    }
                    finally
                    {
                        // Close even when extraction throws, so the PDF is not left open in the session.
                        pdfDoc.Close();
                    }
                }
                finally
                {
                    pdfReader.Close();
                }
            }
        }
    }
    "@ -ReferencedAssemblies $REFS

    # Extract pages 5 through 10 of the source PDF; $content receives the
    # dictionary returned by [ExtractPdf.Text]::From.
    $source_pdf_file = "C:\.TEMP\PDF\source.pdf"
    $from_page = 5
    $to_page = 10
    $content = [ExtractPdf.Text]::From($source_pdf_file, $from_page, $to_page)