Skip to content

Instantly share code, notes, and snippets.

@VehpuS
Forked from jackreichert/document.xml
Created August 5, 2014 18:05
Show Gist options
  • Save VehpuS/5539c4f09a08373400f5 to your computer and use it in GitHub Desktop.
Save VehpuS/5539c4f09a08373400f5 to your computer and use it in GitHub Desktop.

Revisions

  1. Jackreichert revised this gist Nov 12, 2012. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion docx2html
    Original file line number Diff line number Diff line change
    @@ -48,7 +48,7 @@
    $text .= (($formatting['bold'] == 'open') ? '<strong>' : '').
    (($formatting['italic'] == 'open') ? '<em>' : '').
    (($formatting['underline'] == 'open') ? '<u>' : '').
    $paragraph->expand()->textContent.
    htmlentities(iconv('UTF-8', 'ASCII//TRANSLIT',$paragraph->expand()->textContent)).
    (($formatting['underline'] == 'close') ? '</u>' : '').
    (($formatting['italic'] == 'close') ? '</em>' : '').
    (($formatting['bold'] == 'close') ? '</strong>' : '');
  2. Jackreichert revised this gist Nov 10, 2012. 1 changed file with 1 addition and 3 deletions.
    4 changes: 1 addition & 3 deletions docx2html
    Original file line number Diff line number Diff line change
    @@ -67,10 +67,8 @@
    $reader->close();

    // suppress warnings. loadHTML does not require valid HTML but still warns against it...
    error_reporting(0);

    // fix invalid html
    $doc = new DOMDocument();
    $doc->encoding = 'UTF-8';
    $doc->loadHTML($text);
    @$doc->loadHTML($text);
    $goodHTML = simplexml_import_dom($doc)->asXML();
  3. Jackreichert revised this gist Nov 10, 2012. 1 changed file with 3 additions and 0 deletions.
    3 changes: 3 additions & 0 deletions docx2html
    Original file line number Diff line number Diff line change
    @@ -66,6 +66,9 @@
    }
    $reader->close();

    // suppress warnings. loadHTML does not require valid HTML but still warns against it...
    error_reporting(0);

    // fix invalid html
    $doc = new DOMDocument();
    $doc->encoding = 'UTF-8';
  4. Jackreichert created this gist Nov 10, 2012.
    73 changes: 73 additions & 0 deletions docx2html
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,73 @@
    // set location of docx text content file
    $xmlFile = $targetDir."/word/document.xml";
    $reader = new XMLReader;
    $reader->open($xmlFile);

    // set up variables for formatting
    $text = ''; $formatting['bold'] = 'closed'; $formatting['italic'] = 'closed'; $formatting['underline'] = 'closed'; $formatting['header'] = 0;

    // loop through docx xml dom
    while ($reader->read()){
    // look for new paragraphs
    if ($reader->nodeType == XMLREADER::ELEMENT && $reader->name === 'w:p'){
    // set up new instance of XMLReader for parsing paragraph independantly
    $paragraph = new XMLReader;
    $p = $reader->readOuterXML();
    $paragraph->xml($p);

    // search for heading
    preg_match('/<w:pStyle w:val="(Heading.*?[1-6])"/',$p,$matches);
    switch($matches[1]){
    case 'Heading1': $formatting['header'] = 1; break;
    case 'Heading2': $formatting['header'] = 2; break;
    case 'Heading3': $formatting['header'] = 3; break;
    case 'Heading4': $formatting['header'] = 4; break;
    case 'Heading5': $formatting['header'] = 5; break;
    case 'Heading6': $formatting['header'] = 6; break;
    default: $formatting['header'] = 0; break;
    }

    // open h-tag or paragraph
    $text .= ($formatting['header'] > 0) ? '<h'.$formatting['header'].'>' : '<p>';

    // loop through paragraph dom
    while ($paragraph->read()){
    // look for elements
    if ($paragraph->nodeType == XMLREADER::ELEMENT && $paragraph->name === 'w:r'){
    $node = trim($paragraph->readInnerXML());

    // add <br> tags
    if (strstr($node,'<w:br ')) $text .= '<br>';

    // look for formatting tags
    $formatting['bold'] = (strstr($node,'<w:b/>')) ? (($formatting['bold'] == 'closed') ? 'open' : $formatting['bold']) : (($formatting['bold'] == 'opened') ? 'close' : $formatting['bold']);
    $formatting['italic'] = (strstr($node,'<w:i/>')) ? (($formatting['italic'] == 'closed') ? 'open' : $formatting['italic']) : (($formatting['italic'] == 'opened') ? 'close' : $formatting['italic']);
    $formatting['underline'] = (strstr($node,'<w:u ')) ? (($formatting['underline'] == 'closed') ? 'open' : $formatting['underline']) : (($formatting['underline'] == 'opened') ? 'close' : $formatting['underline']);

    // build text string of doc
    $text .= (($formatting['bold'] == 'open') ? '<strong>' : '').
    (($formatting['italic'] == 'open') ? '<em>' : '').
    (($formatting['underline'] == 'open') ? '<u>' : '').
    $paragraph->expand()->textContent.
    (($formatting['underline'] == 'close') ? '</u>' : '').
    (($formatting['italic'] == 'close') ? '</em>' : '').
    (($formatting['bold'] == 'close') ? '</strong>' : '');

    // reset formatting variables
    foreach ($formatting as $key=>$format){
    if ($format == 'open') $formatting[$key] = 'opened';
    if ($format == 'close') $formatting[$key] = 'closed';
    }
    }
    }
    $text .= ($formatting['header'] > 0) ? '</h'.$formatting['header'].'>' : '</p>';
    }

    }
    $reader->close();

    // fix invalid html
    $doc = new DOMDocument();
    $doc->encoding = 'UTF-8';
    $doc->loadHTML($text);
    $goodHTML = simplexml_import_dom($doc)->asXML();