Skip to content

Instantly share code, notes, and snippets.

@nstevens1040
Last active November 4, 2021 19:11
Show Gist options
  • Select an option

  • Save nstevens1040/e921aeafedd25f58e7dd56aad38658ad to your computer and use it in GitHub Desktop.

Select an option

Save nstevens1040/e921aeafedd25f58e7dd56aad38658ad to your computer and use it in GitHub Desktop.
iText7 Extract PDF text from select pages
# Build output directory of the iText 7 "itext.pdfa" project (net461, Debug).
# NOTE(review): $BINFOLDER is not assigned anywhere in the visible script; it is
# presumably set by the caller before this runs - confirm.
$ITEXT_BIN_DIR = "$($BINFOLDER)\itext7-dotnet\itext\itext.pdfa\bin\Debug\net461"

# Assemblies to load, in the order they are handed to Add-Type.
$ITEXT_DLL_NAMES = @(
    'itext.pdfa.dll',
    'Microsoft.Bcl.AsyncInterfaces.dll',
    'Microsoft.Extensions.DependencyInjection.Abstractions.dll',
    'Microsoft.Extensions.DependencyInjection.dll',
    'Microsoft.Extensions.Logging.Abstractions.dll',
    'Microsoft.Extensions.Logging.dll',
    'Microsoft.Extensions.Options.dll',
    'Microsoft.Extensions.Primitives.dll',
    'System.Buffers.dll',
    'System.Diagnostics.DiagnosticSource.dll',
    'System.Memory.dll',
    'System.Numerics.Vectors.dll',
    'System.Runtime.CompilerServices.Unsafe.dll',
    'System.Threading.Tasks.Extensions.dll',
    'System.ValueTuple.dll',
    'BouncyCastle.Crypto.dll',
    'itext.commons.dll',
    'itext.io.dll',
    'itext.kernel.dll'
)

# Full path of every assembly; reused below as the compiler reference list.
$REFS = @(
    foreach ($dll_name in $ITEXT_DLL_NAMES)
    {
        "$($ITEXT_BIN_DIR)\$($dll_name)"
    }
)

# Load each assembly into the current session.
foreach ($assembly_path in $REFS)
{
    Add-Type -Path $assembly_path
}
# Compile the C# helper ExtractPdf.Text against the iText assemblies listed in $REFS.
# A literal (single-quoted) here-string is used so PowerShell never tries to expand
# anything inside the C# source. The C# is kept C# 5 compatible because Windows
# PowerShell compiles Add-Type source with the legacy CodeDOM compiler.
Add-Type -TypeDefinition @'
namespace ExtractPdf
{
    using System;
    using System.Collections.Generic;
    using iText.Kernel.Pdf;
    using iText.Kernel.Pdf.Canvas.Parser;
    using iText.Kernel.Pdf.Canvas.Parser.Listener;

    /// <summary>
    /// Extracts plain text from a range of pages of a PDF file.
    /// </summary>
    public class Text
    {
        /// <summary>
        /// Reads the text of each page in the inclusive range
        /// <paramref name="start_page"/>..<paramref name="end_page"/>.
        /// </summary>
        /// <param name="filepath">Path of the PDF file to read.</param>
        /// <param name="start_page">First page to extract (1-based).</param>
        /// <param name="end_page">Last page to extract (1-based); 0 means the last page of the document.</param>
        /// <returns>
        /// A dictionary mapping the page number (as a decimal string) to that page's text.
        /// Empty when <paramref name="start_page"/> is greater than the effective end page.
        /// </returns>
        /// <exception cref="ArgumentOutOfRangeException">
        /// The requested range is non-empty and reaches outside 1..page count.
        /// </exception>
        public static Dictionary<string, string> From(string filepath, int start_page = 1, int end_page = 0)
        {
            PdfReader pdfReader = new PdfReader(filepath);
            try
            {
                PdfDocument pdfDoc = new PdfDocument(pdfReader);
                try
                {
                    Dictionary<string, string> results = new Dictionary<string, string>();
                    int pageCount = pdfDoc.GetNumberOfPages();
                    if (end_page == 0)
                    {
                        end_page = pageCount;
                    }

                    // An empty range is not an error: nothing to extract.
                    if (start_page > end_page)
                    {
                        return results;
                    }

                    // Fail fast with a clear message instead of part-way through the loop.
                    if (start_page < 1)
                    {
                        throw new ArgumentOutOfRangeException(
                            "start_page", start_page, "Page numbers start at 1.");
                    }
                    if (end_page > pageCount)
                    {
                        throw new ArgumentOutOfRangeException(
                            "end_page", end_page, "The document only has " + pageCount + " page(s).");
                    }

                    for (int i = start_page; i <= end_page; i++)
                    {
                        // A fresh strategy per page so text from one page never leaks into the next.
                        ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
                        string pageContent = PdfTextExtractor.GetTextFromPage(pdfDoc.GetPage(i), strategy);
                        results.Add(i.ToString(), pageContent);
                    }
                    return results;
                }
                finally
                {
                    // Always release the document, even when extraction throws.
                    pdfDoc.Close();
                }
            }
            finally
            {
                // Covers the case where constructing the PdfDocument itself fails.
                pdfReader.Close();
            }
        }
    }
}
'@ -ReferencedAssemblies $REFS
# ---------------------------------------------------------------------------
# Extract pages $from_page..$to_page of $source_pdf_file and write them to
# $output_txt_file, each page drawn as a fixed-width text box:
#   - a header and footer line carrying the page number,
#   - every text line prefixed with "| " and closed with "|",
#   - a dashed separator line after each page.
# ---------------------------------------------------------------------------
$from_page = 5
$to_page = 10
$source_pdf_file = "C:\.TEMP\PDF\source.pdf"
$output_txt_file = "C:\.TEMP\TXT\destin.txt"

# Page number (string) -> extracted page text.
$pdf_ = [ExtractPdf.Text]::From($source_pdf_file, $from_page, $to_page)

# Building blocks of the layout.
$cr = "`r"
$lf = [char]10
$crlf = "`r`n"
$down = [string][char]0x2193   # downwards arrow
$up = [string][char]0x2191     # upwards arrow
$sixty = '============================================================='
$one31 = '-------------------------------------------------------------------------------------------------------------------------------------'

# Total width of a boxed line, including the closing "|".
$box_width = 133

$pages = [System.Collections.Generic.List[string]]::new()
foreach ($key in $pdf_.Keys)
{
    $page_num = $key
    $content = $pdf_[$key]
    $formatted = [System.Collections.Generic.List[string]]::new()
    foreach ($raw_line in $content.Split($lf))
    {
        # Drop stray carriage returns so CRLF input does not break the box.
        $line = ('| ' + $raw_line).Replace($cr, [string]::Empty)
        # PadRight never pads a line that is already too wide. The previous
        # (0..$fill) range counted downwards for negative $fill and therefore
        # appended MORE spaces the further a line overflowed the box.
        $formatted.Add($line.PadRight($box_width - 1) + '|')
    }
    $fstring = [string]::Join($crlf, $formatted)
    $page = "$($down)$($sixty) page $($page_num) $($sixty)$($down)$($crlf)$($fstring)$($crlf)$($up)$($sixty) page $($page_num) $($sixty)$($up)$($crlf)$($one31)"
    $pages.Add($page)
}
[string]::Join($crlf, $pages) | Out-File $output_txt_file -Encoding utf8
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment