smalot · January 6, 2025 20:04 · Aug 8, 2013 · Aug 8, 2013 · Aug 8, 2013 · Aug 8, 2013
diff --git a/PdfParser.php b/PdfParser.php
@@ -4,7 +4,7 @@
  * @file
  * Class PdfParser
  * 
- * @author : Sébastien MALOT <[email protected]>
+ * @author : Sebastien MALOT <[email protected]>
  * @date : 2013-08-08
  *
  * References :

diff --git a/PdfParser → PdfParser.php b/PdfParser → PdfParser.php
diff --git a/PdfParser b/PdfParser
@@ -3,6 +3,9 @@
 /**
  * @file
  * Class PdfParser
+ * 
+ * @author : Sébastien MALOT <[email protected]>
+ * @date : 2013-08-08
  *
  * References :
  * - http://www.mactech.com/articles/mactech/Vol.15/15.09/PDFIntro/index.html

diff --git a/PdfParser b/PdfParser
@@ -0,0 +1,334 @@
+<?php
+
+/**
+ * @file
+ * Class PdfParser
+ *
+ * References :
+ * - http://www.mactech.com/articles/mactech/Vol.15/15.09/PDFIntro/index.html
+ * - http://framework.zend.com/issues/secure/attachment/12512/Pdf.php
+ * - http://www.php.net/manual/en/ref.pdf.php#74211
+ */
+class PdfParser
+{
+  /**
+   * Parse a PDF file and return the text it contains.
+   *
+   * @param string $filename Path of the PDF file, readable by file_get_contents().
+   * @return string|null The extracted text, or null when the file cannot be
+   *                     read or holds no extractable text.
+   */
+  public static function parseFile($filename)
+  {
+    $content = file_get_contents($filename);
+
+    // An unreadable file yields the same result as a document without text.
+    if ($content === false) {
+      return null;
+    }
+
+    return self::extractText($content);
+  }
+
+  /**
+   * Parse an in-memory PDF document and return the text it contains.
+   *
+   * @param string $content Raw bytes of the PDF document.
+   * @return string|null The extracted text, or null when nothing was found.
+   */
+  public static function parseContent($content)
+  {
+    return self::extractText($content);
+  }
+
+  /**
+   * Convert a PDF document into text.
+   *
+   * The document is cut into "obj ... endobj" sections. For each section the
+   * first "<< ... >>" dictionary and the first "stream ... endstream" payload
+   * are kept, the payload is inflated when the dictionary mentions
+   * FlateDecode, and the text operators found in it are interpreted.
+   *
+   * @param string $data Raw bytes of the PDF document.
+   * @return string|null The extracted text with every whitespace run collapsed
+   *                     to one space (a leading space may remain), or null
+   *                     when no text was found.
+   */
+  protected static function extractText($data)
+  {
+    $a_chunks = array();
+
+    // Pair each object's dictionary with its stream payload, if it has one.
+    foreach (self::getDataArray($data, 'obj', 'endobj') as $obj) {
+      $a_filter = self::getDataArray($obj, '<<', '>>');
+
+      if (!isset($a_filter[0])) {
+        continue;
+      }
+
+      $chunk  = array('filter' => $a_filter[0]);
+      $a_data = self::getDataArray($obj, 'stream', 'endstream');
+
+      if (isset($a_data[0])) {
+        $chunk['data'] = self::extractStreamData($a_data[0], $a_filter[0]);
+      }
+
+      $a_chunks[] = $chunk;
+    }
+
+    // Start from an empty string: trim(null) is deprecated since PHP 8.1.
+    $result_data = '';
+
+    foreach ($a_chunks as $chunk) {
+      if (!isset($chunk['data'])) {
+        continue;
+      }
+
+      // The dictionary tells which encoding has been used for the payload.
+      if (strpos($chunk['filter'], 'FlateDecode') !== false) {
+        $data = self::inflateStream($chunk['data']);
+      } else {
+        $data = $chunk['data'];
+      }
+
+      if (trim($data) != '') {
+        $result_data .= ' ' . self::extractTextElements($data);
+      }
+    }
+
+    if (trim($result_data) == '') {
+      return null;
+    }
+
+    // Re-join words hyphenated across a line break, then collapse whitespace.
+    $result_data = preg_replace('/\s*-[\r\n]+\s*/', '', $result_data);
+    $result_data = preg_replace('/\s+/', ' ', $result_data);
+
+    return $result_data;
+  }
+
+  /**
+   * Isolate the payload bytes of a "stream ... endstream" section.
+   *
+   * The payload may be binary, so it must not be trimmed blindly: a
+   * compressed stream whose last byte happens to be a whitespace or NUL byte
+   * would be corrupted. The direct /Length entry of the dictionary is used
+   * when it is consistent with the section; otherwise only the single
+   * end-of-line marker in front of "endstream" is removed.
+   *
+   * @param string $raw        Section text, starting with "stream" and ending with "endstream".
+   * @param string $dictionary The "<< ... >>" dictionary found in the same object.
+   * @return string The payload bytes.
+   */
+  private static function extractStreamData($raw, $dictionary)
+  {
+    $keyword = strlen('stream');
+    $body    = (string) substr($raw, $keyword, strlen($raw) - $keyword - strlen('endstream'));
+
+    // Remove the single end-of-line marker that follows the "stream" keyword.
+    if (strncmp($body, "\r\n", 2) === 0) {
+      $body = (string) substr($body, 2);
+    } elseif (strncmp($body, "\n", 1) === 0 || strncmp($body, "\r", 1) === 0) {
+      $body = (string) substr($body, 1);
+    }
+
+    // Direct length only: "/Length 12 0 R" is an indirect reference and is
+    // skipped. The possessive \d++ stops the engine from matching a prefix
+    // of the object number.
+    if (preg_match('/\/Length\s+(\d++)(?!\s+\d+\s+R)/', $dictionary, $match)) {
+      $length = (int) $match[1];
+
+      // Trust the value only if nothing but an end-of-line follows the payload.
+      if ($length <= strlen($body) && trim((string) substr($body, $length)) === '') {
+        return (string) substr($body, 0, $length);
+      }
+    }
+
+    // No usable length: drop only the end-of-line marker before "endstream".
+    if (substr($body, -2) === "\r\n") {
+      $body = substr($body, 0, -2);
+    } elseif (substr($body, -1) === "\n" || substr($body, -1) === "\r") {
+      $body = substr($body, 0, -1);
+    }
+
+    return (string) $body;
+  }
+
+  /**
+   * Inflate a FlateDecode payload, best effort.
+   *
+   * @param string $compressed The zlib-compressed payload.
+   * @return string The inflated bytes, or an empty string when decoding fails.
+   */
+  private static function inflateStream($compressed)
+  {
+    // Decoding errors are expected on foreign payloads, hence the suppression.
+    $inflated = @gzuncompress($compressed);
+
+    if ($inflated === false) {
+      // Retry with the surrounding whitespace removed, which is how payloads
+      // were historically prepared before decoding.
+      $inflated = @gzuncompress(trim($compressed));
+    }
+
+    return $inflated === false ? '' : $inflated;
+  }
+
+  /**
+   * Interpret the text operators of a decoded content stream.
+   *
+   * The stream is read line by line. A line ending with an operator that is
+   * preceded by ")", "]" or a space is split into operands and operator.
+   * Only the operators that produce text or spacing are acted upon.
+   *
+   * @param string $content Decoded content stream.
+   * @return string The text shown by the stream; empty for streams starting
+   *                with "/CIDInit".
+   */
+  protected static function extractTextElements($content)
+  {
+    if (strpos($content, '/CIDInit') === 0) {
+      return '';
+    }
+
+    $text = '';
+
+    foreach (explode("\n", $content) as $line) {
+      $line    = trim($line);
+      $matches = array();
+
+      // Split the line into its operands ("command") and its operator.
+      if (preg_match('/^(?<command>.*[\)\] ])(?<operator>[a-z]+[\*]?)$/i', $line, $matches)) {
+        $command = trim($matches['command']);
+
+        // Decode three-digit octal escapes in a single pass, so that a decoded
+        // character can never be decoded a second time. Control characters
+        // (below the space character) are dropped.
+        $command = preg_replace_callback('/\\\\([0-7]{3})/', function ($match) {
+          $code = octdec($match[1]);
+
+          return $code < 32 ? '' : chr($code & 0xFF);
+        }, $command);
+
+        // Removes encoded new lines, tabs, ...
+        $command = preg_replace('/\\\\[\r\n]/', '', $command);
+        $command = preg_replace('/\\\\[rnftb ]/', ' ', $command);
+
+        // Force UTF-8 charset
+        $encoding = mb_detect_encoding($command, array('ASCII', 'UTF-8', 'Windows-1252', 'ISO-8859-1'));
+        if (strtoupper($encoding) != 'UTF-8') {
+          if ($decoded = @iconv('CP1252', 'UTF-8//TRANSLIT//IGNORE', $command)) {
+            $command = $decoded;
+          }
+        }
+
+        $operator = trim($matches['operator']);
+      } else {
+        $command  = $line;
+        $operator = '';
+      }
+
+      // Operators that are not listed (Tc, TL, Tm, Tr, Ts, Tw, Tz, g, gs, re,
+      // f, BT, ET, ...) do not contribute to the extracted text.
+      switch ($operator) {
+        // Move text current point.
+        case 'Td':
+          $values = explode(' ', $command);
+          $y = array_pop($values);
+          $x = array_pop($values);
+          // Explicit casts: missing or non-numeric operands count as 0 on
+          // every PHP version.
+          if ((float) $x > 0) {
+            $text .= ' ';
+          }
+          if ((float) $y < 0) {
+            $text .= ' ';
+          }
+          break;
+
+        // Move text current point and set leading.
+        case 'TD':
+          $values = explode(' ', $command);
+          $y = array_pop($values);
+          if ((float) $y < 0) {
+            $text .= "\n";
+          }
+          break;
+
+        // Set font name and size.
+        case 'Tf':
+          $text .= ' ';
+          break;
+
+        // Display text, allowing individual character positioning.
+        case 'TJ':
+          $text .= self::parseTextCommand(self::extractDelimited($command, '[', ']'));
+          break;
+
+        // Display text.
+        case 'Tj':
+          $text .= self::extractDelimited($command, '(', ')');
+          break;
+
+        // Move to start of next line.
+        case 'T*':
+          $text .= "\n";
+          break;
+
+        default:
+          break;
+      }
+    }
+
+    // Unescape the round brackets that were part of the displayed text.
+    return str_replace(array('\\(', '\\)'), array('(', ')'), $text);
+  }
+
+  /**
+   * Return the text between the first $open and the last $close character.
+   *
+   * @param string $command Operand string of a text operator.
+   * @param string $open    Opening delimiter.
+   * @param string $close   Closing delimiter.
+   * @return string The enclosed text, or an empty string when a delimiter is
+   *                missing or the closing one does not follow the opening one.
+   */
+  private static function extractDelimited($command, $open, $close)
+  {
+    $start = mb_strpos($command, $open, 0, 'UTF-8');
+    $end   = mb_strrpos($command, $close, 0, 'UTF-8');
+
+    if ($start === false || $end === false || $end <= $start) {
+      return '';
+    }
+
+    return mb_substr($command, $start + 1, $end - $start - 1, 'UTF-8');
+  }
+
+  /**
+   * Strip out the text from the operand array of a TJ operator.
+   *
+   * The operand is a sequence of "(text)" fragments separated by optional
+   * position adjustments. A space is inserted in front of a fragment when
+   * the adjustment is lower than -50, or when more than 8 characters
+   * separate it from the previous fragment.
+   *
+   * @param string $text      Content found between the square brackets.
+   * @param int    $font_size Currently not used
+   *
+   * @return string
+   */
+  protected static function parseTextCommand($text, $font_size = 0) {
+
+    $result        = '';
+    $cur_start_pos = 0;
+
+    while (($cur_start_text = mb_strpos($text, '(', $cur_start_pos, 'UTF-8')) !== false) {
+      // New text element found
+      if ($cur_start_text - $cur_start_pos > 8) {
+        $spacing = ' ';
+      } else {
+        $spacing_size = mb_substr($text, $cur_start_pos, $cur_start_text - $cur_start_pos, 'UTF-8');
+
+        // Explicit cast: an empty or non-numeric adjustment counts as 0.
+        // Comparing the raw string with an integer gives "" < -50 === true
+        // since PHP 8, which glued a space in front of every fragment.
+        $spacing = ((float) $spacing_size < -50) ? ' ' : '';
+      }
+      $cur_start_text++;
+
+      // Look for the closing bracket, ignoring the escaped ones.
+      $start_search_end = $cur_start_text;
+      while (($cur_start_pos = mb_strpos($text, ')', $start_search_end, 'UTF-8')) !== false) {
+        // A bracket is escaped only when an odd number of backslashes
+        // precedes it: "\\)" is an escaped backslash followed by a real
+        // closing bracket.
+        $backslashes = 0;
+        $probe       = $cur_start_pos - 1;
+        while ($probe >= $cur_start_text && mb_substr($text, $probe, 1, 'UTF-8') === '\\') {
+          $backslashes++;
+          $probe--;
+        }
+
+        if ($backslashes % 2 === 0) {
+          break;
+        }
+        $start_search_end = $cur_start_pos + 1;
+      }
+
+      // Unterminated fragment: keep what has been collected so far.
+      if ($cur_start_pos === false) {
+        break;
+      }
+
+      // Add to result
+      $result .= $spacing . mb_substr($text, $cur_start_text, $cur_start_pos - $cur_start_text, 'UTF-8');
+      $cur_start_pos++;
+    }
+
+    return $result;
+  }
+
+  /**
+   * Convert a section of data into an array, separated by the start and end words.
+   *
+   * Sections do not overlap: the search for the next start word resumes
+   * right after the end word of the previous section, which also guarantees
+   * that the scan always moves forward.
+   *
+   * @param  string $data       The data.
+   * @param  string $start_word The start of each section of data.
+   * @param  string $end_word   The end of each section of data.
+   * @return array              The sections, delimiters included. Empty when
+   *                            nothing matches or when a delimiter is empty.
+   */
+  protected static function getDataArray($data, $start_word, $end_word)
+  {
+    $a_results  = array();
+    $data       = (string) $data;
+    $start_word = (string) $start_word;
+    $end_word   = (string) $end_word;
+
+    // An empty delimiter cannot mark a section.
+    if ($start_word === '' || $end_word === '') {
+      return $a_results;
+    }
+
+    $offset       = 0;
+    $start_length = strlen($start_word);
+    $end_length   = strlen($end_word);
+
+    while (($start = strpos($data, $start_word, $offset)) !== false) {
+      $end = strpos($data, $end_word, $start + $start_length);
+
+      if ($end === false) {
+        break;
+      }
+
+      // data is between start and end, delimiters included
+      $offset      = $end + $end_length;
+      $a_results[] = substr($data, $start, $offset - $start);
+    }
+
+    return $a_results;
+  }
+}