Skip to content

Instantly share code, notes, and snippets.

@smalot
Last active January 6, 2025 20:04
Show Gist options
  • Save smalot/6183152 to your computer and use it in GitHub Desktop.
Save smalot/6183152 to your computer and use it in GitHub Desktop.

Revisions

  1. smalot revised this gist Aug 8, 2013. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion PdfParser.php
    Original file line number Diff line number Diff line change
    @@ -4,7 +4,7 @@
    * @file
    * Class PdfParser
    *
    * @author : Sébastien MALOT <[email protected]>
    * @author : Sebastien MALOT <[email protected]>
    * @date : 2013-08-08
    *
    * References :
  2. smalot renamed this gist Aug 8, 2013. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  3. smalot revised this gist Aug 8, 2013. 1 changed file with 3 additions and 0 deletions.
    3 changes: 3 additions & 0 deletions PdfParser
    Original file line number Diff line number Diff line change
    @@ -3,6 +3,9 @@
    /**
    * @file
    * Class PdfParser
    *
    * @author : Sébastien MALOT <[email protected]>
    * @date : 2013-08-08
    *
    * References :
    * - http://www.mactech.com/articles/mactech/Vol.15/15.09/PDFIntro/index.html
  4. smalot created this gist Aug 8, 2013.
    334 changes: 334 additions & 0 deletions PdfParser
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,334 @@
    <?php

    /**
    * @file
    * Class PdfParser
    *
    * References :
    * - http://www.mactech.com/articles/mactech/Vol.15/15.09/PDFIntro/index.html
    * - http://framework.zend.com/issues/secure/attachment/12512/Pdf.php
    * - http://www.php.net/manual/en/ref.pdf.php#74211
    */
    class PdfParser
    {
        /**
         * Parse a PDF file and return the text found in its content streams.
         *
         * @param string $filename Path (or URL wrapper) readable by file_get_contents().
         * @return string|null The extracted text, or null when the file cannot be
         *                     read or contains no extractable text.
         */
        public static function parseFile($filename)
        {
            $content = file_get_contents($filename);

            // An unreadable file yields no text (file_get_contents already warned).
            if ($content === false) {
                return null;
            }

            return self::extractText($content);
        }

        /**
         * Parse raw PDF bytes already loaded in memory.
         *
         * @param string $content The full PDF document.
         * @return string|null The extracted text, or null when nothing was found.
         */
        public static function parseContent($content)
        {
            return self::extractText($content);
        }

        /**
         * Convert a PDF document into text.
         *
         * The document is split into "obj ... endobj" sections. For every section
         * that has a dictionary ("<< ... >>") and a stream, the stream is inflated
         * when the dictionary mentions FlateDecode and is then scanned for text
         * operators.
         *
         * @param string $data The raw PDF document.
         * @return string|null Whitespace-normalised text, or null when empty.
         */
        protected static function extractText($data)
        {
            $chunks = array();

            // Pair each object's first dictionary with its first stream body.
            foreach (self::getDataArray($data, 'obj', 'endobj') as $object) {
                $dictionaries = self::getDataArray($object, '<<', '>>');

                if (!isset($dictionaries[0])) {
                    continue;
                }

                $chunk = array('filter' => $dictionaries[0]);
                $streams = self::getDataArray($object, 'stream', 'endstream');

                if (isset($streams[0])) {
                    // Keep the bytes between the two keywords untouched; trimming
                    // is decided later because compressed data is binary.
                    $chunk['data'] = substr($streams[0], strlen('stream'), -strlen('endstream'));
                }

                $chunks[] = $chunk;
            }

            // Start from an empty string so trim() below never receives null.
            $result_data = '';

            foreach ($chunks as $chunk) {
                if (!isset($chunk['data'])) {
                    continue;
                }

                if (strpos($chunk['filter'], 'FlateDecode') !== false) {
                    $decoded = self::inflateStream($chunk['data']);
                } else {
                    $decoded = trim($chunk['data']);
                }

                // A failed inflate gives false, which trims to '' and is skipped.
                if (trim((string) $decoded) != '') {
                    $result_data .= ' ' . self::extractTextElements($decoded);
                }
            }

            if (trim($result_data) == '') {
                return null;
            }

            // Re-join words hyphenated across a line break, then collapse whitespace.
            $result_data = preg_replace('/\s*-[\r\n]+\s*/', '', $result_data);
            $result_data = preg_replace('/\s+/', ' ', $result_data);

            return $result_data;
        }

        /**
         * Inflate a zlib (FlateDecode) stream body.
         *
         * The body is what sits between "stream" and "endstream", including the
         * end-of-line markers around the data. Blindly trimming it can remove
         * whitespace-valued bytes that belong to the compressed data (for example
         * a checksum ending in 0x0A or 0x20), so several candidates are tried and
         * the historical fully trimmed form is kept as the last fallback.
         *
         * @param string $body Raw bytes between the stream keywords.
         * @return string|false The inflated data, or false when no candidate decodes.
         */
        private static function inflateStream($body)
        {
            // Leading whitespace is safe to drop: it precedes the compressed data.
            $head = ltrim($body);
            $candidates = array();

            // Remove exactly one end-of-line marker before "endstream", if present.
            foreach (array("\r\n", "\n", "\r") as $eol) {
                $length = strlen($eol);

                if (substr($head, -$length) === $eol) {
                    $candidates[] = (string) substr($head, 0, -$length);
                }
            }

            $candidates[] = $head;
            $candidates[] = trim($body);

            foreach ($candidates as $candidate) {
                // Not every candidate is valid zlib data, so warnings are
                // silenced here and failure is reported through the return value.
                $inflated = @gzuncompress($candidate);

                if ($inflated !== false) {
                    return $inflated;
                }
            }

            return false;
        }

        /**
         * Extract the text shown by the operators of a decoded content stream.
         *
         * The stream is processed line by line; each line is expected to hold
         * operands followed by one operator. Only text-showing and text-positioning
         * operators produce output.
         *
         * @param string $content A decoded content stream.
         * @return string The text found, with "\n" where a line break was detected.
         */
        protected static function extractTextElements($content)
        {
            // Streams starting with /CIDInit are skipped entirely.
            if (strpos($content, '/CIDInit') === 0) {
                return '';
            }

            $text = '';

            foreach (explode("\n", $content) as $line) {
                $line = trim($line);
                $command = $line;
                $operator = '';
                $matches = array();

                // Split the line into operands and operator. The operand part is
                // optional so that bare operators such as "T*" are recognised too.
                if (preg_match('/^(?<command>.*[\)\] ])?(?<operator>[a-z]+[\*]?)$/i', $line, $matches)) {
                    $operands = isset($matches['command']) ? $matches['command'] : '';
                    $command = self::decodeOperands(trim($operands));
                    $operator = trim($matches['operator']);
                }

                switch ($operator) {
                    // Move text current point.
                    case 'Td':
                        $values = explode(' ', $command);
                        $y = array_pop($values);
                        $x = array_pop($values);
                        // Cast explicitly: comparing raw strings with integers
                        // gives version-dependent results for non-numeric input.
                        if ((float) $x > 0) {
                            $text .= ' ';
                        }
                        if ((float) $y < 0) {
                            $text .= ' ';
                        }
                        break;

                    // Move text current point and set leading.
                    case 'TD':
                        $values = explode(' ', $command);
                        $y = array_pop($values);
                        if ((float) $y < 0) {
                            $text .= "\n";
                        }
                        break;

                    // Set font name and size.
                    case 'Tf':
                        $text .= ' ';
                        break;

                    // Display text, allowing individual character positioning.
                    case 'TJ':
                        $text .= self::parseTextCommand(self::textBetween($command, '[', ']'));
                        break;

                    // Display text.
                    case 'Tj':
                        $text .= self::textBetween($command, '(', ')');
                        break;

                    // Move to start of next line.
                    case 'T*':
                        $text .= "\n";
                        break;

                    // Tc, TL, Tm, Tr, Ts, Tw, Tz, g, gs, re, f, BT, ET, lines
                    // without an operator and unknown operators add no text.
                    default:
                        break;
                }
            }

            // Unescape literal parentheses.
            return str_replace(array('\\(', '\\)'), array('(', ')'), $text);
        }

        /**
         * Decode the escape sequences of an operand string and force it to UTF-8.
         *
         * @param string $command The operands of one content-stream line.
         * @return string The decoded operands.
         */
        private static function decodeOperands($command)
        {
            // Octal escapes (\ddd, digits 0-7). Codes below a space are dropped;
            // one pass guarantees decoded output is never re-interpreted.
            $command = preg_replace_callback(
                '/\\\\([0-7]{3})/',
                function ($match) {
                    $code = octdec($match[1]);

                    return $code < 32 ? '' : chr($code);
                },
                $command
            );

            // Removes encoded new lines, tabs, ...
            $command = preg_replace('/\\\\[\r\n]/', '', $command);
            $command = preg_replace('/\\\\[rnftb ]/', ' ', $command);

            // Force UTF-8 charset.
            $encoding = mb_detect_encoding($command, array('ASCII', 'UTF-8', 'Windows-1252', 'ISO-8859-1'));

            if (strtoupper((string) $encoding) != 'UTF-8') {
                // The conversion is best effort: on failure the bytes stay as is.
                $converted = @iconv('CP1252', 'UTF-8//TRANSLIT//IGNORE', $command);

                if ($converted !== false && $converted !== '') {
                    $command = $converted;
                }
            }

            return $command;
        }

        /**
         * Return the part of $command between the first $open and the last $close.
         *
         * A delimiter that is not found is treated as sitting at position 0, which
         * keeps the historical behaviour for malformed operands.
         *
         * @param string $command The operand string.
         * @param string $open    Opening delimiter.
         * @param string $close   Closing delimiter.
         * @return string
         */
        private static function textBetween($command, $open, $close)
        {
            $start = (int) mb_strpos($command, $open, 0, 'UTF-8') + 1;
            $end = (int) mb_strrpos($command, $close, 0, 'UTF-8');

            return mb_substr($command, $start, $end - $start, 'UTF-8');
        }

        /**
         * Strip out the text from the operand array of a TJ operator.
         *
         * The input alternates "(string)" elements with optional numeric
         * adjustments. An adjustment below -50, or more than 8 characters between
         * two strings, is rendered as a single space.
         *
         * @param string $text      The content between the square brackets.
         * @param int    $font_size Currently not used.
         *
         * @return string
         */
        protected static function parseTextCommand($text, $font_size = 0)
        {
            $result = '';
            $cursor = 0;

            while (($open = mb_strpos($text, '(', $cursor, 'UTF-8')) !== false) {
                $gap = $open - $cursor;

                if ($gap > 8) {
                    $spacing = ' ';
                } else {
                    // Parse the adjustment as a number. Comparing the raw string
                    // would make an empty gap count as "below -50" on PHP 8.
                    $adjustment = (float) trim(mb_substr($text, $cursor, $gap, 'UTF-8'));
                    $spacing = ($adjustment < -50) ? ' ' : '';
                }

                $from = $open + 1;
                $search = $from;

                // Find the closing parenthesis, skipping escaped ones.
                while (($close = mb_strpos($text, ')', $search, 'UTF-8')) !== false) {
                    if (mb_substr($text, $close - 1, 1, 'UTF-8') != '\\') {
                        break;
                    }
                    $search = $close + 1;
                }

                // Unterminated string: give up on the rest of the array.
                if ($close === false) {
                    break;
                }

                $result .= $spacing . mb_substr($text, $from, $close - $from, 'UTF-8');
                $cursor = $close + 1;
            }

            return $result;
        }

        /**
         * Convert a section of data into an array, separated by the start and end words.
         *
         * Each returned element starts with $start_word and ends with the first
         * $end_word that follows it. Scanning resumes after that end word, so a
         * start word contained in the end word ("obj" in "endobj") is not matched
         * again.
         *
         * @param string $data       The data.
         * @param string $start_word The start of each section of data.
         * @param string $end_word   The end of each section of data.
         * @return array The array of data.
         */
        protected static function getDataArray($data, $start_word, $end_word)
        {
            $sections = array();
            $data = (string) $data;

            // Empty markers cannot delimit anything and would never advance.
            if ($start_word === '' || $end_word === '') {
                return $sections;
            }

            $end_length = strlen($end_word);
            $offset = 0;

            while (($start = strpos($data, $start_word, $offset)) !== false) {
                $end = strpos($data, $end_word, $start);

                if ($end === false) {
                    break;
                }

                // Data is between start and end, end word included.
                $sections[] = substr($data, $start, $end - $start + $end_length);
                $offset = $end + $end_length;
            }

            return $sections;
        }
    }