Skip to content

Instantly share code, notes, and snippets.

@ngugijames
Created January 11, 2016 10:44
Show Gist options
  • Select an option

  • Save ngugijames/9e9a5842f2d57ec944b1 to your computer and use it in GitHub Desktop.

Select an option

Save ngugijames/9e9a5842f2d57ec944b1 to your computer and use it in GitHub Desktop.

Revisions

  1. ngugijames created this gist Jan 11, 2016.
    157 changes: 157 additions & 0 deletions rtf2text.php
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,157 @@
    /**
    * Special thanks to http://webcheatsheet.com/php/reading_the_clean_text_from_rtf.php
    */
    <?php

    // Tells whether the bytes currently being read belong to the on-screen text.
    // $s is the set of control words active for the current group (control word =>
    // value). If any of the "destination" markers listed below is present with a
    // non-empty value, the group holds something other than text (font table,
    // colour table, embedded data, theme data, or an optional "\*" destination)
    // and false is returned. A missing or empty set counts as plain text.
    function rtf_isPlainText($s) {
        foreach (array("*", "fonttbl", "colortbl", "datastore", "themedata") as $blocker) {
            // isset() && truthy is exactly what !empty() tests.
            if (isset($s[$blocker]) && $s[$blocker]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Extracts the readable text of an RTF file.
     *
     * The data are scanned byte by byte. Every RTF group ("{ ... }") owns a set of
     * active control words; a group starts with a copy of its parent's set and the
     * parent's set is restored when the group closes. rtf_isPlainText() decides
     * from that set whether the bytes being read are on-screen text or must be
     * discarded (font table, colour table, optional destinations, ...).
     *
     * Characters produced from escapes (\uN, \'hh, \emdash, ...) are emitted as
     * UTF-8; ordinary bytes of the document are copied unchanged.
     *
     * @param string $filename Path handed to file_get_contents().
     * @return string The extracted text, or "" when the file is empty or unreadable.
     */
    function rtf2text($filename) {
        // Read the data from the input file.
        $text = file_get_contents($filename);
        if ($text === false || $text === "")
            return "";

        // Unicode code points of the Windows-1252 bytes 0x80-0x9F. As numeric
        // entities these values are C1 controls, which html_entity_decode()
        // refuses to decode, so they have to be translated first.
        // NOTE(review): this assumes the document code page is 1252; \ansicpgN is
        // not interpreted here.
        static $cp1252 = array(
            0x80 => 0x20AC, 0x82 => 0x201A, 0x83 => 0x0192, 0x84 => 0x201E,
            0x85 => 0x2026, 0x86 => 0x2020, 0x87 => 0x2021, 0x88 => 0x02C6,
            0x89 => 0x2030, 0x8A => 0x0160, 0x8B => 0x2039, 0x8C => 0x0152,
            0x8E => 0x017D, 0x91 => 0x2018, 0x92 => 0x2019, 0x93 => 0x201C,
            0x94 => 0x201D, 0x95 => 0x2022, 0x96 => 0x2013, 0x97 => 0x2014,
            0x98 => 0x02DC, 0x99 => 0x2122, 0x9A => 0x0161, 0x9B => 0x203A,
            0x9C => 0x0153, 0x9E => 0x017E, 0x9F => 0x0178,
        );

        $document = "";
        $state = array();        // control words active in the group being read
        $parents = array();      // states of the enclosing groups, innermost last
        $highSurrogate = null;   // first half of a \uN surrogate pair, if pending
        $len = strlen($text);

        // Read the data character by character.
        for ($i = 0; $i < $len; $i++) {
            $c = $text[$i];

            // Opening brace: a subgroup starts and inherits the current state.
            if ($c === "{") {
                $parents[] = $state;
                continue;
            }
            // Closing brace: back to the state of the enclosing group. An
            // unbalanced brace simply resets to an empty (plain text) state.
            if ($c === "}") {
                $state = count($parents) > 0 ? array_pop($parents) : array();
                continue;
            }
            // Raw NUL, CR, FF and LF carry no meaning in RTF. Double quotes are
            // required: in single quotes '\n' is a backslash followed by "n".
            if ($c === "\0" || $c === "\r" || $c === "\f" || $c === "\n")
                continue;
            // Any other byte except a backslash is text (if the group shows text).
            if ($c !== "\\") {
                if (rtf_isPlainText($state))
                    $document .= $c;
                continue;
            }

            // $c is a backslash. At the very end of the data there is nothing
            // left for it to introduce.
            if ($i + 1 >= $len)
                break;
            $nc = $text[$i + 1];
            $plain = rtf_isPlainText($state);

            if ($nc === "\\" || $nc === "{" || $nc === "}") {
                // Escaped backslash or brace: literal text.
                if ($plain)
                    $document .= $nc;
                $i++;
            } elseif ($nc === "~") {
                // Non-breaking space.
                if ($plain)
                    $document .= " ";
                $i++;
            } elseif ($nc === "_") {
                // Non-breaking hyphen.
                if ($plain)
                    $document .= "-";
                $i++;
            } elseif ($nc === "\n" || $nc === "\r") {
                // A backslash in front of CR/LF is a paragraph break. The LF of
                // a CR LF pair is dropped by the raw-newline rule above.
                if ($plain)
                    $document .= "\n";
                $i++;
            } elseif ($nc === "*") {
                // Marks the group as an optional destination, i.e. not text.
                $state["*"] = true;
                $i++;
            } elseif ($nc === "'") {
                // \'hh: the next two characters are the hexadecimal value of a byte.
                $hex = substr($text, $i + 2, 2);
                if ($plain && strlen($hex) === 2 && strspn($hex, "0123456789abcdefABCDEF") === 2) {
                    $byte = hexdec($hex);
                    $code = isset($cp1252[$byte]) ? $cp1252[$byte] : $byte;
                    $document .= html_entity_decode("&#" . $code . ";", ENT_QUOTES, "UTF-8");
                }
                // Step over the quote and both hex digits.
                $i += 3;
            } elseif (($nc >= 'a' && $nc <= 'z') || ($nc >= 'A' && $nc <= 'Z')) {
                // A control word: letters, optionally followed by a (possibly
                // negative) decimal parameter.
                $word = "";
                $param = "";
                for ($k = $i + 1; $k < $len; $k++) {
                    $ch = $text[$k];
                    if (($ch >= 'a' && $ch <= 'z') || ($ch >= 'A' && $ch <= 'Z')) {
                        // A letter after the parameter belongs to the text that
                        // follows. Compared against "" because empty("0") is true.
                        if ($param !== "")
                            break;
                        $word .= $ch;
                    } elseif ($ch >= '0' && $ch <= '9') {
                        $param .= $ch;
                    } elseif ($ch === '-' && $param === "" && $k + 1 < $len
                            && $text[$k + 1] >= '0' && $text[$k + 1] <= '9') {
                        // A minus sign only counts when it starts the parameter.
                        $param .= $ch;
                    } else {
                        break;
                    }
                }
                // $k is now the index of the delimiter (or $len). A single space
                // used as delimiter is part of the control word, not of the text.
                $end = $k - 1;
                if ($k < $len && $text[$k] === " ")
                    $end = $k;

                // Start analyzing what we have read.
                $toText = "";
                switch (strtolower($word)) {
                    // \uN: N is the decimal UTF-16 code unit of the character.
                    case "u":
                        if ($param !== "") {
                            $code = (int) $param;
                            // Values above 32767 are written as negative numbers.
                            if ($code < 0)
                                $code += 65536;
                            if ($code >= 0xD800 && $code <= 0xDBFF) {
                                // High surrogate: wait for the low half.
                                $highSurrogate = $code;
                                $code = null;
                            } elseif ($code >= 0xDC00 && $code <= 0xDFFF) {
                                // Low surrogate: combine with the pending high
                                // half, or drop it when there is none.
                                $code = ($highSurrogate === null)
                                    ? null
                                    : 0x10000 + (($highSurrogate - 0xD800) << 10) + ($code - 0xDC00);
                                $highSurrogate = null;
                            } else {
                                $highSurrogate = null;
                            }
                            if ($code !== null)
                                $toText .= html_entity_decode("&#" . $code . ";", ENT_QUOTES, "UTF-8");
                        }
                        // \ucN gives the number of fallback characters that follow
                        // \uN for readers without Unicode support; the default is 1.
                        $uc = 1;
                        if (isset($state["uc"]) && $state["uc"] !== true)
                            $uc = max(0, (int) $state["uc"]);
                        for ($n = 0; $n < $uc; $n++) {
                            $p = $end + 1;
                            // Never skip across a group boundary.
                            if ($p >= $len || $text[$p] === "{" || $text[$p] === "}")
                                break;
                            if ($text[$p] !== "\\") {
                                $end = $p;                      // ordinary character
                            } elseif ($p + 1 < $len && $text[$p + 1] === "'") {
                                $end = min($len - 1, $p + 3);   // \'hh counts as one
                            } elseif ($p + 1 < $len
                                    && (($text[$p + 1] >= 'a' && $text[$p + 1] <= 'z')
                                        || ($text[$p + 1] >= 'A' && $text[$p + 1] <= 'Z'))) {
                                break;                          // leave control words alone
                            } else {
                                $end = min($len - 1, $p + 1);   // control symbol
                            }
                        }
                        break;
                    // Line feeds, spaces and tabs.
                    case "par": case "page": case "column": case "line": case "lbr":
                        $toText .= "\n";
                        break;
                    case "emspace": case "enspace": case "qmspace":
                        $toText .= " ";
                        break;
                    case "tab": $toText .= "\t"; break;
                    // Current date and time instead of the corresponding fields.
                    case "chdate": $toText .= date("m.d.Y"); break;
                    case "chdpl": $toText .= date("l, j F Y"); break;
                    case "chdpa": $toText .= date("D, j M Y"); break;
                    case "chtime": $toText .= date("H:i:s"); break;
                    // Special characters.
                    case "emdash": $toText .= html_entity_decode("&mdash;", ENT_QUOTES, "UTF-8"); break;
                    case "endash": $toText .= html_entity_decode("&ndash;", ENT_QUOTES, "UTF-8"); break;
                    case "bullet": $toText .= html_entity_decode("&bull;", ENT_QUOTES, "UTF-8"); break;
                    case "lquote": $toText .= html_entity_decode("&lsquo;", ENT_QUOTES, "UTF-8"); break;
                    case "rquote": $toText .= html_entity_decode("&rsquo;", ENT_QUOTES, "UTF-8"); break;
                    case "ldblquote": $toText .= html_entity_decode("&ldquo;", ENT_QUOTES, "UTF-8"); break;
                    case "rdblquote": $toText .= html_entity_decode("&rdquo;", ENT_QUOTES, "UTF-8"); break;
                    // Every other control word is recorded in the state of the
                    // current group; without a parameter its value is true.
                    default:
                        $state[strtolower($word)] = ($param === "") ? true : $param;
                        break;
                }
                // Add data to the output stream if required. The state is checked
                // again because the control word itself may have changed it.
                if (rtf_isPlainText($state))
                    $document .= $toText;

                // Continue right after everything consumed above.
                $i = $end;
            } else {
                // Any other control symbol produces no text; step over it.
                $i++;
            }
        }
        // Return result.
        return $document;
    }
    ?>