1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136
|
/**
 * Extracts the readable text from an RTF document.
 *
 * The input is scanned byte by byte. For every open group ({ ... }) an array
 * of the control words seen so far is kept on a stack; a new group starts
 * with a copy of its parent's state. Text is appended to the result only
 * while self::rtf_isPlainText() accepts the state of the current group.
 *
 * Paragraph, page, column and line breaks are emitted as "<br />", so the
 * result is intended for HTML-style display. Anything that appears outside
 * of every group is ignored.
 *
 * @param string $text Raw RTF source.
 * @return string The extracted text.
 */
public static function Rtf($text) {
    $text = (string) $text;
    $document = "";
    // One state array per open group; $j indexes the current group (-1 = none open).
    $stack = array();
    $j = -1;

    for ($i = 0, $len = strlen($text); $i < $len; $i++) {
        $c = $text[$i];
        switch ($c) {
            // A backslash introduces a control symbol or a control word.
            case "\\":
                // Guard the look-ahead: the backslash may be the very last byte.
                $nc = ($i + 1 < $len) ? $text[$i + 1] : '';
                $toText = "";

                if ($nc === '\\' || $nc === '{' || $nc === '}') {
                    // Escaped backslash or brace: a literal character.
                    $toText = $nc;
                } elseif ($nc === '~') {
                    // Non-breaking space.
                    $toText = ' ';
                } elseif ($nc === '_') {
                    // Non-breaking hyphen.
                    $toText = '-';
                } elseif ($nc === '*') {
                    // Mark the current group so rtf_isPlainText() can inspect the flag.
                    if ($j >= 0) {
                        $stack[$j]["*"] = true;
                    }
                } elseif ($nc === "'") {
                    // \'hh: the next two characters are the hexadecimal code of a character.
                    // NOTE(review): the value is used as a code point as-is; no code page
                    // (e.g. \ansicpg) mapping is applied - confirm this is acceptable.
                    $hex = (string) substr($text, $i + 2, 2);
                    if (preg_match('/^[0-9A-Fa-f]{2}$/', $hex)) {
                        $toText = html_entity_decode("&#" . hexdec($hex) . ";");
                    }
                    // Step over the two hex digits.
                    $i += 2;
                } elseif (($nc >= 'a' && $nc <= 'z') || ($nc >= 'A' && $nc <= 'Z')) {
                    // A letter starts a control word, optionally followed by a numeric parameter.
                    $word = "";
                    $param = null;
                    for ($k = $i + 1, $m = 0; $k < $len; $k++, $m++) {
                        $nc = $text[$k];
                        if (($nc >= 'a' && $nc <= 'z') || ($nc >= 'A' && $nc <= 'Z')) {
                            // A letter after the parameter has started ends the control word.
                            // Compare against null: empty() would treat the parameter "0" as absent.
                            if ($param !== null) {
                                break;
                            }
                            $word .= $nc;
                        } elseif ($nc >= '0' && $nc <= '9') {
                            $param .= $nc;
                        } elseif ($nc === '-' && $param === null) {
                            // A minus sign is only valid as the first character of the parameter.
                            $param = '-';
                        } else {
                            break;
                        }
                    }
                    // Advance over what was read. Together with the increments below, scanning
                    // resumes at the delimiter, so a delimiting space is still treated as text.
                    $i += $m - 1;

                    $key = strtolower($word);
                    switch ($key) {
                        // \uN: N is the decimal code of a Unicode character. If \ucN is set
                        // for the group, N following characters are skipped.
                        case "u":
                            if ($param !== null && $param !== '-') {
                                $code = (int) $param;
                                // RTF writes code points above 32767 as negative 16-bit values.
                                if ($code < 0) {
                                    $code += 65536;
                                }
                                if ($code >= 0) {
                                    $toText .= html_entity_decode("&#x" . dechex($code) . ";");
                                }
                            }
                            $ucDelta = ($j >= 0 && isset($stack[$j]["uc"])) ? (int) $stack[$j]["uc"] : 0;
                            if ($ucDelta > 0) {
                                $i += $ucDelta;
                            }
                            break;
                        // Line breaks, spaces and tabs.
                        case "par": case "page": case "column": case "line": case "lbr":
                            $toText .= "<br />";
                            break;
                        case "emspace": case "enspace": case "qmspace":
                            $toText .= " ";
                            break;
                        case "tab": $toText .= "\t"; break;
                        // Current date and time in place of the corresponding fields.
                        case "chdate": $toText .= date("m.d.Y"); break;
                        case "chdpl": $toText .= date("l, j F Y"); break;
                        case "chdpa": $toText .= date("D, j M Y"); break;
                        case "chtime": $toText .= date("H:i:s"); break;
                        // Special characters.
                        case "emdash": $toText .= html_entity_decode("—"); break;
                        case "endash": $toText .= html_entity_decode("–"); break;
                        // The entity here had been lost, which made \bullet emit nothing.
                        case "bullet": $toText .= html_entity_decode("&bull;"); break;
                        case "lquote": $toText .= html_entity_decode("‘"); break;
                        case "rquote": $toText .= html_entity_decode("’"); break;
                        case "ldblquote": $toText .= html_entity_decode("«"); break;
                        case "rdblquote": $toText .= html_entity_decode("»"); break;
                        // Every other control word is recorded in the state of the current
                        // group. Without a parameter its value is true; otherwise the
                        // parameter string is stored (including "0").
                        default:
                            if ($j >= 0) {
                                $stack[$j][$key] = ($param === null) ? true : $param;
                            }
                            break;
                    }
                }

                // Emit whatever the control sequence produced, if the group is plain text.
                if ($toText !== "" && $j >= 0 && self::rtf_isPlainText($stack[$j])) {
                    $document .= $toText;
                }
                // Step over the character that follows the backslash.
                $i++;
                break;
            // An opening brace starts a subgroup that inherits the state of its parent.
            case "{":
                $stack[] = ($j >= 0) ? $stack[$j] : array();
                $j++;
                break;
            // A closing brace ends the current group; a stray one is ignored.
            case "}":
                if ($j >= 0) {
                    array_pop($stack);
                    $j--;
                }
                break;
            // Skip NUL, CR, FF and LF. These must be double-quoted: in single quotes
            // each literal is two characters long and can never equal the one-byte $c.
            case "\0": case "\r": case "\f": case "\n":
                break;
            // Any other character is text; emit it if the group is plain text.
            default:
                if ($j >= 0 && self::rtf_isPlainText($stack[$j])) {
                    $document .= $c;
                }
                break;
        }
    }

    return $document;
}
Partager