Moodle  2.2.1
http://www.collinsharper.com
C:/xampp/htdocs/moodle/lib/simpletestlib/parser.php
Go to the documentation of this file.
00001 <?php
// Lexer event codes, handed to every parser handler as its second argument.
// Each name receives its 1-based position in the list below, unless a
// constant of that name has already been defined.
foreach (explode(' ', 'LEXER_ENTER LEXER_MATCHED LEXER_UNMATCHED LEXER_EXIT LEXER_SPECIAL') as $i => $constant) {
    defined($constant) || define($constant, $i + 1);
}
/**
 * Compounded regular expression.
 *
 * Holds a list of patterns, each with a label. The patterns are combined
 * into one alternation so a single preg_match() call finds the leftmost
 * match among all of them and the label of the pattern that matched can
 * be reported.
 */
class ParallelRegex {
    /** @var array Raw patterns exactly as passed to addPattern(). */
    var $_patterns;
    /** @var array Labels, index-aligned with $_patterns. */
    var $_labels;
    /** @var string|null Cached compound regex; null means "needs rebuilding". */
    var $_regex;
    /** @var boolean True for case sensitive matching. */
    var $_case;

    /**
     * Starts with an empty pattern list.
     *
     * @param boolean $case    True for case sensitive matching.
     */
    function __construct($case) {
        $this->_case = $case;
        $this->_patterns = array();
        $this->_labels = array();
        $this->_regex = null;
    }

    /**
     * Legacy class-named initialiser, kept so that existing code calling
     * $this->ParallelRegex(...) keeps working. PHP 8 no longer treats a
     * class-named method as the constructor, hence __construct() above.
     *
     * @param boolean $case    True for case sensitive matching.
     */
    function ParallelRegex($case) {
        self::__construct($case);
    }

    /**
     * Adds a pattern with an optional label.
     *
     * @param string $pattern    Perl style regex body, without delimiters.
     *                           '/', '(' and ')' are escaped when the
     *                           compound regex is built.
     * @param mixed $label       Value returned by match() when this pattern
     *                           is the one that matched.
     */
    function addPattern($pattern, $label = true) {
        $count = count($this->_patterns);
        $this->_patterns[$count] = $pattern;
        $this->_labels[$count] = $label;
        $this->_regex = null;    // Force a rebuild on the next match.
    }

    /**
     * Attempts to match all patterns at once against a string.
     *
     * @param string $subject    String to match against.
     * @param string $match      Receives the matched text, or '' when the
     *                           patterns did not match. Left untouched when
     *                           no pattern has been added.
     * @return mixed             False if nothing matched, otherwise the label
     *                           of the matching pattern (true when the
     *                           matched group is empty).
     */
    function match($subject, &$match) {
        if (count($this->_patterns) == 0) {
            return false;
        }
        if (! preg_match($this->_getCompoundedRegex(), $subject, $matches)) {
            $match = '';
            return false;
        }
        $match = $matches[0];
        for ($i = 1, $count = count($matches); $i < $count; $i++) {
            // Groups that did not take part are ''. A strict comparison is
            // needed so a matched "0" is not mistaken for "no match".
            if ($matches[$i] !== '') {
                return $this->_labels[$i - 1];
            }
        }
        return true;
    }

    /**
     * Builds (and caches) the single regex made of all patterns.
     *
     * Every pattern becomes one capture group, so group N belongs to
     * pattern N - 1. The escaped groups are built in a local array: the
     * stored patterns stay raw, so a rebuild after a later addPattern()
     * does not escape and wrap them a second time.
     *
     * @return string    Complete regex including delimiters and flags.
     */
    function _getCompoundedRegex() {
        if ($this->_regex === null) {
            $groups = array();
            foreach ($this->_patterns as $pattern) {
                $groups[] = '(' . str_replace(
                        array('/', '(', ')'),
                        array('\/', '\(', '\)'),
                        $pattern) . ')';
            }
            $this->_regex = "/" . implode("|", $groups) . "/" . $this->_getPerlMatchingFlags();
        }
        return $this->_regex;
    }

    /**
     * Regex modifiers for the compound expression.
     *
     * @return string    "msS", plus "i" when matching is case insensitive.
     */
    function _getPerlMatchingFlags() {
        return ($this->_case ? "msS" : "msSi");
    }
}
00118 
/**
 * Stack of lexer states. The bottom (starting) state can never be removed.
 */
class SimpleStateStack {
    /** @var array States, most recent last. */
    var $_stack;

    /**
     * Creates the stack holding only the starting state.
     *
     * @param string $start    Starting state name.
     */
    function __construct($start) {
        $this->_stack = array($start);
    }

    /**
     * Legacy class-named initialiser, kept for code that calls it directly.
     * PHP 8 no longer treats a class-named method as the constructor, hence
     * __construct() above.
     *
     * @param string $start    Starting state name.
     */
    function SimpleStateStack($start) {
        self::__construct($start);
    }

    /**
     * Reads the state on top of the stack.
     *
     * @return string    Current state.
     */
    function getCurrent() {
        return $this->_stack[count($this->_stack) - 1];
    }

    /**
     * Pushes a new state, which becomes the current one.
     *
     * @param string $state    State to enter.
     */
    function enter($state) {
        array_push($this->_stack, $state);
    }

    /**
     * Pops the current state, returning to the previous one.
     *
     * @return boolean    False if only the starting state is left, in
     *                    which case nothing is removed.
     */
    function leave() {
        if (count($this->_stack) == 1) {
            return false;
        }
        array_pop($this->_stack);
        return true;
    }
}
00170 
/**
 * Multi-mode lexer.
 *
 * Splits a string into tokens using one ParallelRegex per mode and passes
 * every token to a handler method on the parser object, together with one
 * of the LEXER_* event codes. Patterns can enter a new mode, leave the
 * current mode, or switch to a "special" mode for a single token only.
 */
class SimpleLexer {
    /** @var array Mode name => ParallelRegex. */
    var $_regexes;
    /** @var object Object whose handler methods receive the tokens. */
    var $_parser;
    /** @var SimpleStateStack Stack of active modes. */
    var $_mode;
    /** @var array Mode name => handler method name on the parser. */
    var $_mode_handlers;
    /** @var boolean True for case sensitive matching. */
    var $_case;

    /**
     * Sets up the lexer with its starting mode; the starting mode is
     * handled by the parser method of the same name until remapped.
     *
     * @param object $parser     Handling strategy, taken by reference.
     * @param string $start      Starting mode and default handler name.
     * @param boolean $case      True for case sensitive matching.
     */
    function __construct(&$parser, $start = "accept", $case = false) {
        $this->_case = $case;
        $this->_regexes = array();
        $this->_parser = &$parser;
        $this->_mode = new SimpleStateStack($start);
        $this->_mode_handlers = array($start => $start);
    }

    /**
     * Legacy class-named initialiser, kept because subclasses invoke it as
     * $this->SimpleLexer(...). PHP 8 no longer treats a class-named method
     * as the constructor, hence __construct() above. The call is made with
     * self:: so that a subclass constructor is never re-entered.
     *
     * @param object $parser     Handling strategy, taken by reference.
     * @param string $start      Starting mode and default handler name.
     * @param boolean $case      True for case sensitive matching.
     */
    function SimpleLexer(&$parser, $start = "accept", $case = false) {
        self::__construct($parser, $start, $case);
    }

    /**
     * Adds a token pattern that is reported as LEXER_MATCHED and does not
     * change the mode.
     *
     * @param string $pattern    Regex body, without delimiters.
     * @param string $mode       Mode in which the pattern is active.
     */
    function addPattern($pattern, $mode = "accept") {
        if (! isset($this->_regexes[$mode])) {
            $this->_regexes[$mode] = new ParallelRegex($this->_case);
        }
        $this->_regexes[$mode]->addPattern($pattern);
        if (! isset($this->_mode_handlers[$mode])) {
            $this->_mode_handlers[$mode] = $mode;
        }
    }

    /**
     * Adds a pattern that pushes a new mode when matched. The matched text
     * is reported as LEXER_ENTER to the handler of the new mode.
     *
     * @param string $pattern     Regex body, without delimiters.
     * @param string $mode        Mode in which the pattern is active.
     * @param string $new_mode    Mode to enter on a match.
     */
    function addEntryPattern($pattern, $mode, $new_mode) {
        if (! isset($this->_regexes[$mode])) {
            $this->_regexes[$mode] = new ParallelRegex($this->_case);
        }
        $this->_regexes[$mode]->addPattern($pattern, $new_mode);
        if (! isset($this->_mode_handlers[$new_mode])) {
            $this->_mode_handlers[$new_mode] = $new_mode;
        }
    }

    /**
     * Adds a pattern that leaves the current mode when matched. The matched
     * text is reported as LEXER_EXIT before the mode is popped.
     *
     * @param string $pattern    Regex body, without delimiters.
     * @param string $mode       Mode to leave.
     */
    function addExitPattern($pattern, $mode) {
        if (! isset($this->_regexes[$mode])) {
            $this->_regexes[$mode] = new ParallelRegex($this->_case);
        }
        $this->_regexes[$mode]->addPattern($pattern, "__exit");
        if (! isset($this->_mode_handlers[$mode])) {
            $this->_mode_handlers[$mode] = $mode;
        }
    }

    /**
     * Adds a pattern whose match is handled in another mode for that one
     * token only: the special mode is entered, the token is reported as
     * LEXER_SPECIAL, and the mode is left again straight away.
     *
     * @param string $pattern    Regex body, without delimiters.
     * @param string $mode       Mode in which the pattern is active.
     * @param string $special    Mode used for the single token.
     */
    function addSpecialPattern($pattern, $mode, $special) {
        if (! isset($this->_regexes[$mode])) {
            $this->_regexes[$mode] = new ParallelRegex($this->_case);
        }
        // The leading underscore marks the label as special,
        // see _isSpecialMode() and _decodeSpecial().
        $this->_regexes[$mode]->addPattern($pattern, "_$special");
        if (! isset($this->_mode_handlers[$special])) {
            $this->_mode_handlers[$special] = $special;
        }
    }

    /**
     * Sets the parser method that receives the tokens of a mode.
     *
     * @param string $mode       Mode to be remapped.
     * @param string $handler    Method name on the parser object.
     */
    function mapHandler($mode, $handler) {
        $this->_mode_handlers[$mode] = $handler;
    }

    /**
     * Splits the text into tokens and dispatches each one to the parser.
     *
     * @param string $raw    Text to tokenise.
     * @return boolean       True when the whole text was processed. False
     *                       if no parser is set, a handler returned a false
     *                       value, an exit pattern matched while only the
     *                       starting mode was on the stack, or a match
     *                       consumed nothing.
     */
    function parse($raw) {
        if (! isset($this->_parser)) {
            return false;
        }
        $length = strlen($raw);
        while (is_array($parsed = $this->_reduce($raw))) {
            list($raw, $unmatched, $matched, $mode) = $parsed;
            if (! $this->_dispatchTokens($unmatched, $matched, $mode)) {
                return false;
            }
            if ($raw === '') {
                return true;
            }
            // No characters were consumed: stop rather than loop forever.
            if (strlen($raw) == $length) {
                return false;
            }
            $length = strlen($raw);
        }
        if (! $parsed) {
            return false;
        }
        // Nothing else matches; the remainder is one last unmatched token.
        return $this->_invokeParser($raw, LEXER_UNMATCHED);
    }

    /**
     * Sends the unmatched text and then the matched token to the parser,
     * changing the mode stack as the pattern label dictates.
     *
     * @param string $unmatched    Text preceding the match.
     * @param string $matched      Text that matched a pattern.
     * @param mixed $mode          Pattern label: a boolean for a plain
     *                             pattern, "__exit" for an exit pattern,
     *                             "_name" for a special pattern, otherwise
     *                             the mode to enter.
     * @return boolean             False if a handler returned a false value
     *                             or the mode could not be left.
     */
    function _dispatchTokens($unmatched, $matched, $mode = false) {
        if (! $this->_invokeParser($unmatched, LEXER_UNMATCHED)) {
            return false;
        }
        if (is_bool($mode)) {
            return $this->_invokeParser($matched, LEXER_MATCHED);
        }
        // "__exit" also starts with an underscore, so the exit check has
        // to come before the special mode check.
        if ($this->_isModeEnd($mode)) {
            if (! $this->_invokeParser($matched, LEXER_EXIT)) {
                return false;
            }
            return $this->_mode->leave();
        }
        if ($this->_isSpecialMode($mode)) {
            $this->_mode->enter($this->_decodeSpecial($mode));
            if (! $this->_invokeParser($matched, LEXER_SPECIAL)) {
                return false;
            }
            return $this->_mode->leave();
        }
        $this->_mode->enter($mode);
        return $this->_invokeParser($matched, LEXER_ENTER);
    }

    /**
     * Tests whether a pattern label is the exit marker.
     *
     * @param mixed $mode    Pattern label.
     * @return boolean       True if the label is exactly "__exit".
     */
    function _isModeEnd($mode) {
        return ($mode === "__exit");
    }

    /**
     * Tests whether a pattern label marks a special (one token) mode.
     *
     * @param string $mode    Pattern label.
     * @return boolean        True if the label starts with an underscore.
     */
    function _isSpecialMode($mode) {
        return (strncmp($mode, "_", 1) == 0);
    }

    /**
     * Strips the special marker from a pattern label.
     *
     * @param string $mode    Label starting with an underscore.
     * @return string         Mode name without the underscore.
     */
    function _decodeSpecial($mode) {
        return substr($mode, 1);
    }

    /**
     * Calls the handler mapped to the current mode.
     *
     * @param string $content      Token text. Empty text (or false, as
     *                             returned by substr() on older PHP) is
     *                             skipped and counts as success.
     * @param integer $is_match    One of the LEXER_* event codes.
     * @return mixed               Whatever the handler returns; callers
     *                             treat a false value as failure.
     */
    function _invokeParser($content, $is_match) {
        if (($content === '') || ($content === false)) {
            return true;
        }
        $handler = $this->_mode_handlers[$this->_mode->getCurrent()];
        return $this->_parser->$handler($content, $is_match);
    }

    /**
     * Finds the first token match in the current mode and splits the text
     * around it.
     *
     * @param string $raw    Text still to be parsed.
     * @return mixed         Array of (remaining text, unmatched text,
     *                       matched text, pattern label) on a match,
     *                       otherwise true.
     */
    function _reduce($raw) {
        $mode = $this->_mode->getCurrent();
        // A mode that was entered but never given any pattern has no regex;
        // report "no match" so the rest is passed on as unmatched text
        // instead of calling a method on an unset entry.
        if (! isset($this->_regexes[$mode])) {
            return true;
        }
        if ($action = $this->_regexes[$mode]->match($raw, $match)) {
            $unparsed_character_count = strpos($raw, $match);
            $unparsed = substr($raw, 0, $unparsed_character_count);
            $raw = substr($raw, $unparsed_character_count + strlen($match));
            return array($raw, $unparsed, $match, $action);
        }
        return true;
    }
}
00444 
/**
 * Lexer preconfigured for the subset of HTML that is of interest:
 * a fixed list of tags and their attributes, with style blocks, script
 * blocks and comments skipped.
 */
class SimpleHtmlLexer extends SimpleLexer {

    /**
     * Sets up the lexer in 'text' mode with all HTML patterns registered.
     *
     * @param object $parser    Object providing the handler methods named
     *                          in this class (acceptTextToken,
     *                          acceptStartToken, acceptEndToken,
     *                          acceptAttributeToken, ignore).
     */
    function __construct(&$parser) {
        // The parent initialiser is invoked through its class-named method,
        // which is callable as a plain method on every PHP version.
        $this->SimpleLexer($parser, 'text');
        $this->mapHandler('text', 'acceptTextToken');
        $this->_addSkipping();
        foreach ($this->_getParsedTags() as $tag) {
            $this->_addTag($tag);
        }
        $this->_addInTagTokens();
    }

    /**
     * Legacy class-named initialiser, kept for code that calls it directly.
     * PHP 8 no longer treats a class-named method as the constructor, hence
     * __construct() above.
     *
     * @param object $parser    Object providing the handler methods.
     */
    function SimpleHtmlLexer(&$parser) {
        self::__construct($parser);
    }

    /**
     * Lists the tags that are tokenised; every other tag is left inside
     * the text content.
     *
     * @return array    Tag names in the order their patterns are added.
     */
    function _getParsedTags() {
        return array('a', 'base', 'title', 'form', 'input', 'button', 'textarea', 'select',
                'option', 'frameset', 'frame', 'label');
    }

    /**
     * Registers the modes whose content goes to the 'ignore' handler:
     * style blocks, script blocks and HTML comments.
     */
    function _addSkipping() {
        $this->mapHandler('css', 'ignore');
        $this->addEntryPattern('<style', 'text', 'css');
        $this->addExitPattern('</style>', 'css');
        $this->mapHandler('js', 'ignore');
        $this->addEntryPattern('<script', 'text', 'js');
        $this->addExitPattern('</script>', 'js');
        $this->mapHandler('comment', 'ignore');
        $this->addEntryPattern('<!--', 'text', 'comment');
        $this->addExitPattern('-->', 'comment');
    }

    /**
     * Registers the start and end patterns of one tag.
     *
     * @param string $tag    Tag name, without angle brackets.
     */
    function _addTag($tag) {
        $this->addSpecialPattern("</$tag>", 'text', 'acceptEndToken');
        // The word boundary stops a short name from matching the start of a
        // longer one (e.g. "<a" inside "<abbr" or "<area"). It is zero
        // width, so the token handed to the parser is still "<" plus the name.
        $this->addEntryPattern("<$tag\\b", 'text', 'tag');
    }

    /**
     * Registers the patterns used inside an opening tag: whitespace is
     * ignored, attribute values get their own modes, and '/>' or '>' ends
     * the tag.
     */
    function _addInTagTokens() {
        $this->mapHandler('tag', 'acceptStartToken');
        $this->addSpecialPattern('\s+', 'tag', 'ignore');
        $this->_addAttributeTokens();
        $this->addExitPattern('/>', 'tag');
        $this->addExitPattern('>', 'tag');
    }

    /**
     * Registers the patterns for double quoted, single quoted and unquoted
     * attribute values. A backslash-escaped quote inside a quoted value is
     * matched as a plain token so that it does not end the value.
     */
    function _addAttributeTokens() {
        $this->mapHandler('dq_attribute', 'acceptAttributeToken');
        $this->addEntryPattern('=\s*"', 'tag', 'dq_attribute');
        $this->addPattern("\\\\\"", 'dq_attribute');
        $this->addExitPattern('"', 'dq_attribute');
        $this->mapHandler('sq_attribute', 'acceptAttributeToken');
        $this->addEntryPattern("=\s*'", 'tag', 'sq_attribute');
        $this->addPattern("\\\\'", 'sq_attribute');
        $this->addExitPattern("'", 'sq_attribute');
        $this->mapHandler('uq_attribute', 'acceptAttributeToken');
        $this->addSpecialPattern('=\s*[^>\s]*', 'tag', 'uq_attribute');
    }
}
00537 
/**
 * Converts the HTML tokens produced by the lexer into SAX-like events
 * (startElement, endElement, addContent) on a listener object.
 */
class SimpleHtmlSaxParser {
    /** @var object Lexer returned by createLexer(). */
    var $_lexer;
    /** @var object Receiver of the element and content events. */
    var $_listener;
    /** @var string Name of the tag currently being read, '' outside a tag. */
    var $_tag;
    /** @var array Attribute name => value collected for the current tag. */
    var $_attributes;
    /** @var string Name of the attribute whose value is being read. */
    var $_current_attribute;

    /**
     * Sets the listener and creates the lexer.
     *
     * @param object $listener    Object providing startElement(),
     *                            endElement() and addContent().
     */
    function __construct(&$listener) {
        $this->_listener = &$listener;
        // createLexer() takes its argument by reference, so it is handed
        // an ordinary variable rather than $this itself.
        $parser = $this;
        $this->_lexer = $this->createLexer($parser);
        $this->_tag = '';
        $this->_attributes = array();
        $this->_current_attribute = '';
    }

    /**
     * Legacy class-named initialiser, kept for code that calls it directly.
     * PHP 8 no longer treats a class-named method as the constructor, hence
     * __construct() above.
     *
     * @param object $listener    Object receiving the events.
     */
    function SimpleHtmlSaxParser(&$listener) {
        self::__construct($listener);
    }

    /**
     * Runs the content through the lexer, which calls back into the
     * accept*() handlers of this object.
     *
     * @param string $raw    Page text to parse.
     * @return boolean       Result of the lexer's parse().
     */
    function parse($raw) {
        return $this->_lexer->parse($raw);
    }

    /**
     * Creates the lexer; override to supply a different one.
     *
     * @param object $parser    Parser the lexer should call back.
     * @return SimpleHtmlLexer  Lexer bound to the parser.
     */
    function &createLexer(&$parser) {
        $lexer = new SimpleHtmlLexer($parser);
        return $lexer;
    }

    /**
     * Handles tokens seen inside an opening tag.
     *
     * LEXER_ENTER carries "<name" and starts a new tag, LEXER_EXIT ends the
     * tag and fires startElement() on the listener. Any other token except
     * a lone '=' is taken as an attribute name, which starts out with an
     * empty value.
     *
     * @param string $token     Incoming characters.
     * @param integer $event    LEXER_* event code.
     * @return mixed            True, or the listener's startElement()
     *                          result on LEXER_EXIT.
     */
    function acceptStartToken($token, $event) {
        if ($event == LEXER_ENTER) {
            $this->_tag = strtolower(substr($token, 1));
            return true;
        }
        if ($event == LEXER_EXIT) {
            $success = $this->_listener->startElement(
                    $this->_tag,
                    $this->_attributes);
            $this->_tag = '';
            $this->_attributes = array();
            return $success;
        }
        if ($token != '=') {
            $this->_current_attribute = strtolower(SimpleHtmlSaxParser::decodeHtml($token));
            $this->_attributes[$this->_current_attribute] = '';
        }
        return true;
    }

    /**
     * Handles a closing tag token such as "</a>".
     *
     * @param string $token     Incoming characters.
     * @param integer $event    LEXER_* event code (not used).
     * @return mixed            False if the token is not a closing tag,
     *                          otherwise the listener's endElement() result.
     */
    function acceptEndToken($token, $event) {
        if (! preg_match('/<\/(.*)>/', $token, $matches)) {
            return false;
        }
        return $this->_listener->endElement(strtolower($matches[1]));
    }

    /**
     * Appends attribute value text to the attribute being read.
     *
     * LEXER_UNMATCHED text is appended after entity decoding. A
     * LEXER_SPECIAL token is an unquoted value still carrying its leading
     * '=', which is stripped first. Other events, such as the opening and
     * closing quotes, add nothing.
     *
     * @param string $token     Incoming characters.
     * @param integer $event    LEXER_* event code.
     * @return boolean          Always true.
     */
    function acceptAttributeToken($token, $event) {
        // Compared against '' rather than tested for truth so that an
        // attribute named "0" still receives its value.
        if ($this->_current_attribute !== '') {
            if ($event == LEXER_UNMATCHED) {
                $this->_attributes[$this->_current_attribute] .=
                        SimpleHtmlSaxParser::decodeHtml($token);
            }
            if ($event == LEXER_SPECIAL) {
                $this->_attributes[$this->_current_attribute] .=
                        preg_replace('/^=\s*/' , '', SimpleHtmlSaxParser::decodeHtml($token));
            }
        }
        return true;
    }

    /**
     * Placeholder handler for entity tokens. No lexer mode in this file is
     * mapped to it, and it returns nothing.
     *
     * @param string $token     Incoming characters.
     * @param integer $event    LEXER_* event code.
     */
    function acceptEntityToken($token, $event) {
    }

    /**
     * Passes text outside of tags to the listener.
     *
     * @param string $token     Incoming characters.
     * @param integer $event    LEXER_* event code (not used).
     * @return mixed            The listener's addContent() result.
     */
    function acceptTextToken($token, $event) {
        return $this->_listener->addContent($token);
    }

    /**
     * Handler for content that is to be discarded.
     *
     * @param string $token     Incoming characters.
     * @param integer $event    LEXER_* event code.
     * @return boolean          Always true.
     */
    function ignore($token, $event) {
        return true;
    }

    /**
     * Decodes HTML entities, including both kinds of quote.
     *
     * Declared static because it is called as
     * SimpleHtmlSaxParser::decodeHtml() throughout this class; PHP 8
     * rejects such calls to non-static methods when no object is in scope.
     *
     * @param string $html    Text with entities.
     * @return string         Decoded text, in PHP's default character set.
     */
    static function decodeHtml($html) {
        return html_entity_decode($html, ENT_QUOTES);
    }

    /**
     * Reduces HTML to the text a reader would see: comments, scripts and
     * tags are removed, images are replaced by their alt text, entities are
     * decoded and runs of whitespace collapse to a single space.
     *
     * @param string $html    Raw HTML.
     * @return string         Plain text, trimmed.
     */
    static function normalise($html) {
        // The "s" modifier lets "." cross line breaks, so multi-line
        // comments and script bodies are removed as a whole; "i" also
        // catches upper case <SCRIPT> tags.
        $text = preg_replace('|<!--.*?-->|s', '', $html);
        $text = preg_replace('|<script[^>]*>.*?</script>|si', '', $text);
        $text = preg_replace('|<img[^>]*alt\s*=\s*"([^"]*)"[^>]*>|', ' \1 ', $text);
        $text = preg_replace('|<img[^>]*alt\s*=\s*\'([^\']*)\'[^>]*>|', ' \1 ', $text);
        $text = preg_replace('|<img[^>]*alt\s*=\s*([a-zA-Z_]+)[^>]*>|', ' \1 ', $text);
        $text = preg_replace('|<[^>]*>|', '', $text);
        $text = SimpleHtmlSaxParser::decodeHtml($text);
        $text = preg_replace('|\s+|', ' ', $text);
        // Strip non-breaking spaces from both ends. A decoded &nbsp; is the
        // single byte \xA0 in ISO-8859-1 but \xC2\xA0 in UTF-8; removing
        // only \xA0 would leave a stray \xC2 byte behind in UTF-8 text.
        // NOTE(review): ISO-8859-1 text that really ends in "\xC2\xA0"
        // loses both bytes - assumed rare enough to accept.
        return preg_replace('/^(?:\xC2?\xA0)+|(?:\xC2?\xA0)+$/D', '', trim($text));
    }
}
00718 
/**
 * Base class for receivers of the SAX-like events. Every callback here is
 * an empty stub that returns nothing; subclasses override the ones they
 * need.
 *
 * SimpleHtmlSaxParser hands the callback results back to the lexer, which
 * stops parsing on a false value, so an overriding method should return
 * true to let parsing continue.
 */
class SimpleSaxListener {

    /**
     * Nothing to set up. Declared so that the class-named method below is
     * not picked up as a deprecated PHP 4-style constructor on PHP 7.
     */
    function __construct() {
    }

    /**
     * Legacy class-named initialiser, kept for subclasses that call
     * $this->SimpleSaxListener().
     */
    function SimpleSaxListener() {
        self::__construct();
    }

    /**
     * Called at the start of each recognised element.
     *
     * @param string $name         Element name.
     * @param array $attributes    Attribute name => value.
     */
    function startElement($name, $attributes) {
    }

    /**
     * Called at the end of each recognised element.
     *
     * @param string $name    Element name.
     */
    function endElement($name) {
    }

    /**
     * Called for text outside of recognised tags.
     *
     * @param string $text    Text content.
     */
    function addContent($text) {
    }
}
00764 ?>
 All Data Structures Namespaces Files Functions Variables Enumerations