|
Moodle
2.2.1
http://www.collinsharper.com
|
<?php
/**
 * Regex-driven, mode-based lexer plus a small SAX-style HTML parser built on it.
 *
 * Classes in this file:
 *  - ParallelRegex       : several patterns compiled into one alternation.
 *  - SimpleStateStack    : stack of lexer modes.
 *  - SimpleLexer         : splits input into tokens and dispatches them to
 *                          handler methods on a parser object.
 *  - SimpleHtmlLexer     : SimpleLexer pre-configured for a subset of HTML tags.
 *  - SimpleHtmlSaxParser : turns lexer tokens into start/end/content events.
 *  - SimpleSaxListener   : no-op base class for event listeners.
 */

// Event codes handed to parser handlers as their second argument. They are
// numbered 1..5 in the order listed, unless a constant is already defined.
foreach (array('LEXER_ENTER', 'LEXER_MATCHED',
               'LEXER_UNMATCHED', 'LEXER_EXIT',
               'LEXER_SPECIAL') as $i => $constant) {
    if (! defined($constant)) {
        define($constant, $i + 1);
    }
}

/**
 * Combines several patterns into a single regular expression so that one
 * preg_match() call finds whichever pattern matches first in the subject.
 */
class ParallelRegex {
    var $_patterns;
    var $_labels;
    var $_regex;
    var $_case;

    /**
     * Creates an empty pattern set.
     *
     * @param boolean $case    True for case sensitive matching, false to
     *                         add the "i" modifier.
     */
    function __construct($case) {
        $this->_case = $case;
        $this->_patterns = array();
        $this->_labels = array();
        $this->_regex = null;
    }

    /**
     * Legacy (PHP 4 style) constructor name, kept so existing explicit
     * calls keep working. Forwards to __construct().
     *
     * @param boolean $case    See __construct().
     */
    function ParallelRegex($case) {
        self::__construct($case);
    }

    /**
     * Appends a pattern. Only "/", "(" and ")" are escaped when the
     * compound expression is built; everything else is used as written.
     *
     * @param string $pattern    Regex fragment without delimiters.
     * @param mixed $label       Value returned by match() when this
     *                           pattern is the one that matched.
     */
    function addPattern($pattern, $label = true) {
        $this->_patterns[] = $pattern;
        $this->_labels[] = $label;
        // Force the compound expression to be rebuilt on next use.
        $this->_regex = null;
    }

    /**
     * Attempts to match the subject against all patterns at once.
     *
     * @param string $subject    Text to search.
     * @param string $match      Receives the full matched text, or an
     *                           empty string when nothing matched.
     * @return mixed             False when nothing matched, otherwise the
     *                           label of the matching pattern, or true if
     *                           no sub-group captured any text.
     */
    function match($subject, &$match) {
        $match = '';
        if (count($this->_patterns) == 0) {
            return false;
        }
        if (! preg_match($this->_getCompoundedRegex(), $subject, $matches)) {
            return false;
        }
        $match = $matches[0];
        for ($i = 1, $count = count($matches); $i < $count; $i++) {
            // Compare against '' rather than testing truthiness, otherwise
            // a token of exactly "0" would be treated as "not matched".
            if ($matches[$i] !== '') {
                return $this->_labels[$i - 1];
            }
        }
        return true;
    }

    /**
     * Builds (and caches) the single expression made of every pattern,
     * each wrapped in its own capturing group and joined with "|".
     *
     * The stored patterns are left untouched so that the expression can be
     * rebuilt safely after further addPattern() calls.
     *
     * @return string    Delimited regex including modifiers.
     */
    function _getCompoundedRegex() {
        if ($this->_regex === null) {
            $groups = array();
            foreach ($this->_patterns as $pattern) {
                $groups[] = '(' . str_replace(
                        array('/', '(', ')'),
                        array('\/', '\(', '\)'),
                        $pattern) . ')';
            }
            $this->_regex = "/" . implode("|", $groups) . "/" . $this->_getPerlMatchingFlags();
        }
        return $this->_regex;
    }

    /**
     * Chooses the regex modifiers according to the case flag.
     *
     * @return string    "msS" when case sensitive, "msSi" otherwise.
     */
    function _getPerlMatchingFlags() {
        return ($this->_case ? "msS" : "msSi");
    }
}

/**
 * Stack of lexer modes. The bottom (starting) mode can never be popped.
 */
class SimpleStateStack {
    var $_stack;

    /**
     * Creates the stack holding only the starting mode.
     *
     * @param string $start    Starting mode.
     */
    function __construct($start) {
        $this->_stack = array($start);
    }

    /**
     * Legacy (PHP 4 style) constructor name; forwards to __construct().
     *
     * @param string $start    Starting mode.
     */
    function SimpleStateStack($start) {
        self::__construct($start);
    }

    /**
     * Returns the mode on top of the stack.
     *
     * @return string    Current mode.
     */
    function getCurrent() {
        return $this->_stack[count($this->_stack) - 1];
    }

    /**
     * Pushes a new mode, making it the current one.
     *
     * @param string $state    Mode to enter.
     */
    function enter($state) {
        array_push($this->_stack, $state);
    }

    /**
     * Pops the current mode.
     *
     * @return boolean    False if only the starting mode is left (nothing
     *                    is popped), true otherwise.
     */
    function leave() {
        if (count($this->_stack) == 1) {
            return false;
        }
        array_pop($this->_stack);
        return true;
    }
}

/**
 * Mode-based lexer. Each mode owns a ParallelRegex; matched and unmatched
 * text is passed to a handler method on the parser object, chosen by the
 * current mode.
 */
class SimpleLexer {
    var $_regexes;
    var $_parser;
    var $_mode;
    var $_mode_handlers;
    var $_case;

    /**
     * Sets up the lexer with a single starting mode.
     *
     * @param object $parser    Object whose methods receive the tokens.
     * @param string $start     Starting mode; also the default handler
     *                          method name for that mode.
     * @param boolean $case     True for case sensitive patterns.
     */
    function __construct(&$parser, $start = "accept", $case = false) {
        $this->_case = $case;
        $this->_regexes = array();
        $this->_parser = &$parser;
        $this->_mode = new SimpleStateStack($start);
        $this->_mode_handlers = array($start => $start);
    }

    /**
     * Legacy (PHP 4 style) constructor name, kept because subclasses may
     * call it explicitly. Forwards to __construct().
     *
     * @param object $parser    See __construct().
     * @param string $start     See __construct().
     * @param boolean $case     See __construct().
     */
    function SimpleLexer(&$parser, $start = "accept", $case = false) {
        self::__construct($parser, $start, $case);
    }

    /**
     * Adds a plain token pattern to a mode. Matches are reported with
     * LEXER_MATCHED and do not change the mode.
     *
     * @param string $pattern    Regex fragment.
     * @param string $mode       Mode in which the pattern is active.
     */
    function addPattern($pattern, $mode = "accept") {
        if (! isset($this->_regexes[$mode])) {
            $this->_regexes[$mode] = new ParallelRegex($this->_case);
        }
        $this->_regexes[$mode]->addPattern($pattern);
        if (! isset($this->_mode_handlers[$mode])) {
            $this->_mode_handlers[$mode] = $mode;
        }
    }

    /**
     * Adds a pattern that pushes a new mode when matched. The match is
     * reported with LEXER_ENTER after the new mode has been entered.
     *
     * @param string $pattern     Regex fragment.
     * @param string $mode        Mode in which the pattern is active.
     * @param string $new_mode    Mode to enter on a match.
     */
    function addEntryPattern($pattern, $mode, $new_mode) {
        if (! isset($this->_regexes[$mode])) {
            $this->_regexes[$mode] = new ParallelRegex($this->_case);
        }
        $this->_regexes[$mode]->addPattern($pattern, $new_mode);
        if (! isset($this->_mode_handlers[$new_mode])) {
            $this->_mode_handlers[$new_mode] = $new_mode;
        }
    }

    /**
     * Adds a pattern that pops the current mode when matched. The match
     * is reported with LEXER_EXIT before the mode is left.
     *
     * @param string $pattern    Regex fragment.
     * @param string $mode       Mode to leave.
     */
    function addExitPattern($pattern, $mode) {
        if (! isset($this->_regexes[$mode])) {
            $this->_regexes[$mode] = new ParallelRegex($this->_case);
        }
        $this->_regexes[$mode]->addPattern($pattern, "__exit");
        if (! isset($this->_mode_handlers[$mode])) {
            $this->_mode_handlers[$mode] = $mode;
        }
    }

    /**
     * Adds a pattern whose match is handled in a temporary mode: the
     * special mode is entered, the match is reported with LEXER_SPECIAL,
     * and the mode is left again straight away.
     *
     * @param string $pattern    Regex fragment.
     * @param string $mode       Mode in which the pattern is active.
     * @param string $special    Temporary mode used for the match.
     */
    function addSpecialPattern($pattern, $mode, $special) {
        if (! isset($this->_regexes[$mode])) {
            $this->_regexes[$mode] = new ParallelRegex($this->_case);
        }
        // The leading underscore marks the label as a special mode.
        $this->_regexes[$mode]->addPattern($pattern, "_$special");
        if (! isset($this->_mode_handlers[$special])) {
            $this->_mode_handlers[$special] = $special;
        }
    }

    /**
     * Sets the parser method that handles tokens for a mode.
     *
     * @param string $mode       Mode name.
     * @param string $handler    Method name on the parser object.
     */
    function mapHandler($mode, $handler) {
        $this->_mode_handlers[$mode] = $handler;
    }

    /**
     * Splits the input into tokens and dispatches them to the parser.
     *
     * @param string $raw    Text to lex.
     * @return boolean       True if everything was consumed and every
     *                       handler returned a true value. False if there
     *                       is no parser, a handler failed, an exit was
     *                       attempted from the starting mode, or a step
     *                       consumed no input.
     */
    function parse($raw) {
        if (! isset($this->_parser)) {
            return false;
        }
        $length = strlen($raw);
        while (is_array($parsed = $this->_reduce($raw))) {
            list($raw, $unmatched, $matched, $mode) = $parsed;
            if (! $this->_dispatchTokens($unmatched, $matched, $mode)) {
                return false;
            }
            if ($raw === '') {
                return true;
            }
            // No progress was made: bail out instead of looping forever.
            if (strlen($raw) == $length) {
                return false;
            }
            $length = strlen($raw);
        }
        if (! $parsed) {
            return false;
        }
        // Nothing more matches; the remainder is plain unmatched text.
        return $this->_invokeParser($raw, LEXER_UNMATCHED);
    }

    /**
     * Sends the unmatched text, then the matched token, to the parser,
     * performing whatever mode change the match label asks for.
     *
     * @param string $unmatched    Text preceding the match.
     * @param string $matched      Matched text.
     * @param mixed $mode          Match label: a boolean for a plain
     *                             token, "__exit" to leave the mode, a
     *                             "_"-prefixed name for a special mode,
     *                             or the name of a mode to enter.
     * @return boolean             False if a handler failed or the mode
     *                             stack could not be popped.
     */
    function _dispatchTokens($unmatched, $matched, $mode = false) {
        if (! $this->_invokeParser($unmatched, LEXER_UNMATCHED)) {
            return false;
        }
        if (is_bool($mode)) {
            return $this->_invokeParser($matched, LEXER_MATCHED);
        }
        if ($this->_isModeEnd($mode)) {
            if (! $this->_invokeParser($matched, LEXER_EXIT)) {
                return false;
            }
            return $this->_mode->leave();
        }
        if ($this->_isSpecialMode($mode)) {
            $this->_mode->enter($this->_decodeSpecial($mode));
            if (! $this->_invokeParser($matched, LEXER_SPECIAL)) {
                return false;
            }
            return $this->_mode->leave();
        }
        $this->_mode->enter($mode);
        return $this->_invokeParser($matched, LEXER_ENTER);
    }

    /**
     * Tests whether a match label is the exit marker.
     *
     * @param string $mode    Match label.
     * @return boolean        True for "__exit".
     */
    function _isModeEnd($mode) {
        return ($mode === "__exit");
    }

    /**
     * Tests whether a match label denotes a special (temporary) mode,
     * i.e. starts with an underscore.
     *
     * @param string $mode    Match label.
     * @return boolean        True if the label starts with "_".
     */
    function _isSpecialMode($mode) {
        return (strncmp($mode, "_", 1) == 0);
    }

    /**
     * Strips the leading marker character from a special mode label.
     *
     * @param string $mode    Label as stored by addSpecialPattern().
     * @return string         Underlying mode name.
     */
    function _decodeSpecial($mode) {
        return substr($mode, 1);
    }

    /**
     * Calls the handler mapped to the current mode. Empty or false
     * content is skipped and counts as success.
     *
     * @param string $content      Token text.
     * @param integer $is_match    One of the LEXER_* event codes.
     * @return boolean             Handler result, or true when skipped.
     */
    function _invokeParser($content, $is_match) {
        if (($content === '') || ($content === false)) {
            return true;
        }
        $handler = $this->_mode_handlers[$this->_mode->getCurrent()];
        return $this->_parser->$handler($content, $is_match);
    }

    /**
     * Finds the next token in the current mode.
     *
     * @param string $raw    Remaining input.
     * @return mixed         array(remaining input, unmatched prefix,
     *                       matched text, match label) on a match, or
     *                       true when nothing (more) matches.
     */
    function _reduce($raw) {
        $current = $this->_mode->getCurrent();
        // A mode may have been entered without any pattern registered for
        // it; treat that as "no match" rather than calling match() on null.
        if (! isset($this->_regexes[$current])) {
            return true;
        }
        if ($action = $this->_regexes[$current]->match($raw, $match)) {
            $unparsed_character_count = strpos($raw, $match);
            $unparsed = substr($raw, 0, $unparsed_character_count);
            $raw = substr($raw, $unparsed_character_count + strlen($match));
            return array($raw, $unparsed, $match, $action);
        }
        return true;
    }
}

/**
 * Lexer configured for the HTML tags listed in _getParsedTags(). Style,
 * script and comment sections are routed to the parser's ignore() handler.
 */
class SimpleHtmlLexer extends SimpleLexer {

    /**
     * Registers all modes and patterns. Lexing starts in "text" mode,
     * handled by acceptTextToken().
     *
     * @param object $parser    Object receiving the token callbacks.
     */
    function __construct(&$parser) {
        parent::__construct($parser, 'text');
        $this->mapHandler('text', 'acceptTextToken');
        $this->_addSkipping();
        foreach ($this->_getParsedTags() as $tag) {
            $this->_addTag($tag);
        }
        $this->_addInTagTokens();
    }

    /**
     * Legacy (PHP 4 style) constructor name; forwards to __construct().
     *
     * @param object $parser    See __construct().
     */
    function SimpleHtmlLexer(&$parser) {
        self::__construct($parser);
    }

    /**
     * Lists the tag names that are tokenised.
     *
     * @return array    Tag names.
     */
    function _getParsedTags() {
        return array('a', 'base', 'title', 'form', 'input', 'button', 'textarea', 'select',
                'option', 'frameset', 'frame', 'label');
    }

    /**
     * Adds the css, js and comment modes, all mapped to ignore().
     */
    function _addSkipping() {
        $this->mapHandler('css', 'ignore');
        $this->addEntryPattern('<style', 'text', 'css');
        $this->addExitPattern('</style>', 'css');
        $this->mapHandler('js', 'ignore');
        $this->addEntryPattern('<script', 'text', 'js');
        $this->addExitPattern('</script>', 'js');
        $this->mapHandler('comment', 'ignore');
        $this->addEntryPattern('<!--', 'text', 'comment');
        $this->addExitPattern('-->', 'comment');
    }

    /**
     * Registers the patterns for one tag: the closing tag is a special
     * token handled by acceptEndToken(), the opening "<tag" enters "tag"
     * mode.
     *
     * @param string $tag    Tag name.
     */
    function _addTag($tag) {
        $this->addSpecialPattern("</$tag>", 'text', 'acceptEndToken');
        $this->addEntryPattern("<$tag", 'text', 'tag');
    }

    /**
     * Registers the patterns active inside an opening tag: whitespace is
     * ignored, attributes are tokenised, and ">" or "/>" ends the tag.
     */
    function _addInTagTokens() {
        $this->mapHandler('tag', 'acceptStartToken');
        $this->addSpecialPattern('\s+', 'tag', 'ignore');
        $this->_addAttributeTokens();
        $this->addExitPattern('/>', 'tag');
        $this->addExitPattern('>', 'tag');
    }

    /**
     * Registers double quoted, single quoted and unquoted attribute value
     * patterns, all handled by acceptAttributeToken().
     */
    function _addAttributeTokens() {
        $this->mapHandler('dq_attribute', 'acceptAttributeToken');
        $this->addEntryPattern('=\s*"', 'tag', 'dq_attribute');
        // A backslash-escaped quote does not end the value.
        $this->addPattern("\\\\\"", 'dq_attribute');
        $this->addExitPattern('"', 'dq_attribute');
        $this->mapHandler('sq_attribute', 'acceptAttributeToken');
        $this->addEntryPattern("=\s*'", 'tag', 'sq_attribute');
        $this->addPattern("\\\\'", 'sq_attribute');
        $this->addExitPattern("'", 'sq_attribute');
        $this->mapHandler('uq_attribute', 'acceptAttributeToken');
        $this->addSpecialPattern('=\s*[^>\s]*', 'tag', 'uq_attribute');
    }
}

/**
 * Converts SimpleHtmlLexer tokens into startElement(), endElement() and
 * addContent() calls on a listener object.
 */
class SimpleHtmlSaxParser {
    var $_lexer;
    var $_listener;
    var $_tag;
    var $_attributes;
    var $_current_attribute;

    /**
     * Stores the listener and builds the lexer.
     *
     * @param object $listener    Object receiving startElement(),
     *                            endElement() and addContent() calls.
     */
    function __construct(&$listener) {
        $this->_listener = &$listener;
        $this->_lexer = &$this->createLexer($this);
        $this->_tag = '';
        $this->_attributes = array();
        $this->_current_attribute = '';
    }

    /**
     * Legacy (PHP 4 style) constructor name; forwards to __construct().
     *
     * @param object $listener    See __construct().
     */
    function SimpleHtmlSaxParser(&$listener) {
        self::__construct($listener);
    }

    /**
     * Runs the lexer over the content, firing listener events.
     *
     * @param string $raw    HTML text.
     * @return boolean       Result of the lexer's parse().
     */
    function parse($raw) {
        return $this->_lexer->parse($raw);
    }

    /**
     * Creates the lexer bound to the given parser.
     *
     * @param object $parser    Object receiving the token callbacks.
     * @return SimpleHtmlLexer  New lexer.
     */
    function &createLexer(&$parser) {
        $lexer = new SimpleHtmlLexer($parser);
        return $lexer;
    }

    /**
     * Handles every token seen while inside an opening tag.
     *
     * On LEXER_ENTER the tag name is recorded. On LEXER_EXIT the listener
     * is told about the element and the collected state is cleared. Any
     * other token except "=" is taken as an attribute name.
     *
     * @param string $token     Token text.
     * @param integer $event    LEXER_* event code.
     * @return boolean          Listener result on exit, otherwise true.
     */
    function acceptStartToken($token, $event) {
        if ($event == LEXER_ENTER) {
            // Drop the leading "<".
            $this->_tag = strtolower(substr($token, 1));
            return true;
        }
        if ($event == LEXER_EXIT) {
            $success = $this->_listener->startElement(
                    $this->_tag,
                    $this->_attributes);
            $this->_tag = '';
            $this->_attributes = array();
            return $success;
        }
        if ($token != '=') {
            $this->_current_attribute = strtolower(SimpleHtmlSaxParser::decodeHtml($token));
            $this->_attributes[$this->_current_attribute] = '';
        }
        return true;
    }

    /**
     * Handles a closing tag token.
     *
     * @param string $token     Token text such as "</a>".
     * @param integer $event    LEXER_* event code (unused).
     * @return boolean          False if the token is not a closing tag,
     *                          otherwise the listener's endElement()
     *                          result.
     */
    function acceptEndToken($token, $event) {
        if (! preg_match('/<\/(.*)>/', $token, $matches)) {
            return false;
        }
        return $this->_listener->endElement(strtolower($matches[1]));
    }

    /**
     * Appends attribute value text to the attribute currently being
     * collected. Tokens are ignored when no attribute name has been seen.
     *
     * @param string $token     Token text.
     * @param integer $event    LEXER_* event code.
     * @return boolean          Always true.
     */
    function acceptAttributeToken($token, $event) {
        if ($this->_current_attribute) {
            if ($event == LEXER_UNMATCHED) {
                $this->_attributes[$this->_current_attribute] .=
                        SimpleHtmlSaxParser::decodeHtml($token);
            }
            if ($event == LEXER_SPECIAL) {
                // Unquoted values arrive with their leading "=".
                $this->_attributes[$this->_current_attribute] .=
                        preg_replace('/^=\s*/' , '', SimpleHtmlSaxParser::decodeHtml($token));
            }
        }
        return true;
    }

    /**
     * Placeholder for entity tokens; does nothing and returns nothing.
     * No mode in this file is mapped to it.
     *
     * @param string $token     Token text.
     * @param integer $event    LEXER_* event code.
     */
    function acceptEntityToken($token, $event) {
    }

    /**
     * Passes text found outside of tags to the listener.
     *
     * @param string $token     Text content.
     * @param integer $event    LEXER_* event code (unused).
     * @return boolean          Listener's addContent() result.
     */
    function acceptTextToken($token, $event) {
        return $this->_listener->addContent($token);
    }

    /**
     * Handler for tokens that should be discarded.
     *
     * @param string $token     Token text (unused).
     * @param integer $event    LEXER_* event code (unused).
     * @return boolean          Always true.
     */
    function ignore($token, $event) {
        return true;
    }

    /**
     * Decodes HTML entities, including both kinds of quotes.
     *
     * Declared static because it is invoked as
     * SimpleHtmlSaxParser::decodeHtml() and uses no instance state.
     *
     * @param string $html    Text containing entities.
     * @return string         Decoded text.
     */
    static function decodeHtml($html) {
        return html_entity_decode($html, ENT_QUOTES);
    }

    /**
     * Reduces HTML to plain text: comments and scripts are dropped,
     * images are replaced by their alt text, remaining tags are removed,
     * entities are decoded and runs of whitespace are collapsed.
     *
     * Declared static because it uses no instance state.
     *
     * @param string $html    HTML text.
     * @return string         Plain text.
     */
    static function normalise($html) {
        // The "s" modifier lets "." cross newlines so multi-line comments
        // and scripts are removed whole.
        $text = preg_replace('|<!--.*?-->|s', '', $html);
        $text = preg_replace('|<script[^>]*>.*?</script>|si', '', $text);
        $text = preg_replace('|<img[^>]*alt\s*=\s*"([^"]*)"[^>]*>|', ' \1 ', $text);
        $text = preg_replace('|<img[^>]*alt\s*=\s*\'([^\']*)\'[^>]*>|', ' \1 ', $text);
        $text = preg_replace('|<img[^>]*alt\s*=\s*([a-zA-Z_]+)[^>]*>|', ' \1 ', $text);
        $text = preg_replace('|<[^>]*>|', '', $text);
        $text = SimpleHtmlSaxParser::decodeHtml($text);
        $text = preg_replace('|\s+|', ' ', $text);
        // TODO: \xA0 is a non-breaking space (&nbsp;). Add a test for this.
        return trim(trim($text), "\xA0");
    }
}

/**
 * Base listener with empty event methods; meant to be extended. The
 * methods here return nothing.
 */
class SimpleSaxListener {

    /**
     * Does nothing.
     */
    function __construct() {
    }

    /**
     * Legacy (PHP 4 style) constructor name, kept for subclasses that
     * call it explicitly. Does nothing.
     */
    function SimpleSaxListener() {
    }

    /**
     * Called for each opening tag.
     *
     * @param string $name         Lower-cased tag name.
     * @param array $attributes    Attribute values keyed by lower-cased
     *                             attribute name.
     */
    function startElement($name, $attributes) {
    }

    /**
     * Called for each closing tag.
     *
     * @param string $name    Lower-cased tag name.
     */
    function endElement($name) {
    }

    /**
     * Called for text found outside of tags.
     *
     * @param string $text    Text content.
     */
    function addContent($text) {
    }
}