Skip to content

Instantly share code, notes, and snippets.

@santisq
Forked from indented-automation/Get-FileEncoding.ps1
Created November 1, 2022 14:18
Show Gist options
  • Select an option

  • Save santisq/b10a90b6e2298150cdefd54b39c643db to your computer and use it in GitHub Desktop.

Select an option

Save santisq/b10a90b6e2298150cdefd54b39c643db to your computer and use it in GitHub Desktop.

Revisions

  1. @indented-automation indented-automation revised this gist Nov 1, 2022. 1 changed file with 1 addition and 0 deletions.
    1 change: 1 addition & 0 deletions Get-FileEncoding.ps1
    Original file line number Diff line number Diff line change
    @@ -55,6 +55,7 @@ function Get-FileEncoding {
    'RTF' = '7B-5C-72-74-66-31'
    'GIF' = '47-49-46-38'
    'REGPOL' = '50-52-65-67'
    'GZIP' = '1F-8B'
    'JPEG' = 'FF-D8'
    'MSEXE' = '4D-5A'
    'ZIP' = '50-4B'
  2. @indented-automation indented-automation revised this gist May 20, 2020. 1 changed file with 6 additions and 5 deletions.
    11 changes: 6 additions & 5 deletions Get-FileEncoding.ps1
    Original file line number Diff line number Diff line change
    @@ -100,11 +100,12 @@ function Get-FileEncoding {
    }

    [PSCustomObject]@{
    Name = Split-Path $Path -Leaf
    Extension = [System.IO.Path]::GetExtension($Path)
    Encoding = $encoding
    Path = $Path
    } | Add-Member -TypeName 'EncodingInfo' -PassThru
    Name = Split-Path $Path -Leaf
    Extension = [System.IO.Path]::GetExtension($Path)
    Encoding = $encoding
    Path = $Path
    PSTypeName = 'EncodingInfo'
    }
    } catch {
    $pscmdlet.WriteError($_)
    }
  3. @indented-automation indented-automation created this gist May 1, 2018.
    112 changes: 112 additions & 0 deletions Get-FileEncoding.ps1
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,112 @@
    using namespace System.Collections.Generic; using namespace System.Linq

    function Get-FileEncoding {
        <#
        .SYNOPSIS
            Attempt to determine a file type based on a BOM or file header.
        .DESCRIPTION
            This script attempts to determine file types based on a byte sequence at the beginning of the file.

            If an identifiable byte sequence is not present the file type cannot be determined using this method
            and the Encoding property of the returned object is empty.

            The order signatures appear in is critical where signatures overlap. For example, UTF32-LE must be
            evaluated before UTF16-LE.

            Only the bytes actually present in the file are compared. A signature longer than the file is never
            considered a match, so a file containing nothing but 'FF-FE' is reported as UTF16-LE rather than UTF32-LE.
        .EXAMPLE
            Get-FileEncoding -Path .\script.ps1

            Reports the encoding of script.ps1 if the file starts with a known byte order mark.
        .EXAMPLE
            Get-ChildItem -File | Get-FileEncoding -IncludeBinary

            Tests every file in the current directory, including the signatures for common binary formats.
        .OUTPUTS
            EncodingInfo. A custom object with the properties Name, Extension, Encoding and Path.
        .LINK
            https://en.wikipedia.org/wiki/Byte_order_mark#cite_note-b-15
        .LINK
            https://filesignatures.net
        #>

        [CmdletBinding()]
        [OutputType('EncodingInfo')]
        param (
            # The path to a file to analyze.
            [Parameter(Mandatory, Position = 1, ValueFromPipeline, ValueFromPipelineByPropertyName)]
            [ValidateScript( { Test-Path $_ -PathType Leaf } )]
            [Alias('FullName')]
            [String]$Path,

            # Test the file against a small set of signature definitions for binary file types.
            #
            # Identification should be treated as tentative. Several file formats cannot be identified using the sequence at the start alone.
            [Switch]$IncludeBinary
        )

        begin {
            # Signatures are tested in the order they are declared. Longer sequences which share a prefix
            # with a shorter sequence must be declared first.
            $signatures = [Ordered]@{
                'UTF32-LE'   = 'FF-FE-00-00'
                'UTF32-BE'   = '00-00-FE-FF'
                'UTF8'       = 'EF-BB-BF'
                'UTF16-LE'   = 'FF-FE'
                'UTF16-BE'   = 'FE-FF'
                'UTF7'       = '2B-2F-76-38', '2B-2F-76-39', '2B-2F-76-2B', '2B-2F-76-2F'
                'UTF1'       = 'F7-64-4C'
                'UTF-EBCDIC' = 'DD-73-66-73'
                'SCSU'       = '0E-FE-FF'
                'BOCU-1'     = 'FB-EE-28'
                'GB-18030'   = '84-31-95-33'
            }

            if ($IncludeBinary) {
                $signatures += [Ordered]@{
                    'LNK'      = '4C-00-00-00-01-14-02-00'
                    'MSEXCEL'  = '50-4B-03-04-14-00-06-00'
                    'PNG'      = '89-50-4E-47-0D-0A-1A-0A'
                    'MSOFFICE' = 'D0-CF-11-E0-A1-B1-1A-E1'
                    '7ZIP'     = '37-7A-BC-AF-27-1C'
                    'RTF'      = '7B-5C-72-74-66-31'
                    'GIF'      = '47-49-46-38'
                    'REGPOL'   = '50-52-65-67'
                    'GZIP'     = '1F-8B'
                    'JPEG'     = 'FF-D8'
                    'MSEXE'    = '4D-5A'
                    'ZIP'      = '50-4B'
                }
            }

            # Convert sequence strings to byte lists. Intended to simplify signature maintenance.
            # The keys are copied first because the dictionary is modified inside the loop.
            [String[]]$keys = $signatures.Keys

            # The number of bytes to read from each file: the length of the longest signature.
            $sampleSize = 0

            foreach ($name in $keys) {
                $sequences = [List[List[Byte]]]::new()

                foreach ($value in $signatures[$name]) {
                    $signatureBytes = [List[Byte]]::new()
                    foreach ($byte in $value.Split('-')) {
                        $signatureBytes.Add([Convert]::ToByte($byte, 16))
                    }
                    $sequences.Add($signatureBytes)

                    if ($signatureBytes.Count -gt $sampleSize) {
                        $sampleSize = $signatureBytes.Count
                    }
                }

                $signatures[$name] = $sequences
            }
        }

        process {
            try {
                $Path = $pscmdlet.GetUnresolvedProviderPathFromPSPath($Path)

                $buffer = [Byte[]]::new($sampleSize)
                $length = 0

                $stream = [System.IO.File]::OpenRead($Path)
                try {
                    # Stream.Read may return fewer bytes than requested; keep reading until the
                    # buffer is full or the end of the file is reached.
                    while ($length -lt $buffer.Length) {
                        $count = $stream.Read($buffer, $length, $buffer.Length - $length)
                        if ($count -le 0) {
                            break
                        }
                        $length += $count
                    }
                } finally {
                    # Release the file handle even if the read fails.
                    $stream.Dispose()
                }

                # Keep only the bytes which were read. Padding from the zero-initialised buffer must not
                # take part in the comparison or short files may match a longer signature.
                $header = [List[Byte]]::new($length)
                for ($i = 0; $i -lt $length; $i++) {
                    $header.Add($buffer[$i])
                }

                $encoding = $null
                :signature foreach ($name in $signatures.Keys) {
                    foreach ($sequence in $signatures[$name]) {
                        if ($sequence.Count -gt $header.Count) {
                            # The file is shorter than this signature, it cannot match.
                            continue
                        }

                        $sample = $header.GetRange(0, $sequence.Count)
                        if ([System.Linq.Enumerable]::SequenceEqual($sample, $sequence)) {
                            # The first matching signature wins, stop testing.
                            $encoding = $name
                            break signature
                        }
                    }
                }

                [PSCustomObject]@{
                    Name       = Split-Path $Path -Leaf
                    Extension  = [System.IO.Path]::GetExtension($Path)
                    Encoding   = $encoding
                    Path       = $Path
                    PSTypeName = 'EncodingInfo'
                }
            } catch {
                # Report the failure as a non-terminating error so pipeline processing continues.
                $pscmdlet.WriteError($_)
            }
        }
    }