[ Index ]

PHP Cross Reference of Akelos Framework

title

Body

[close]

/ -> AkLexer.php (source)

   1  <?php
   2  /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
   3  
   4  // +----------------------------------------------------------------------+
   5  // | Akelos Framework - http://www.akelos.org                             |
   6  // +----------------------------------------------------------------------+
   7  // | Copyright (c) 2002-2006, Akelos Media, S.L.  & Bermi Ferrer Martinez |
   8  // | Released under the GNU Lesser General Public License, see LICENSE.txt|
   9  // +----------------------------------------------------------------------+
  10  
  11  /**
  12  * Author Marcus Baker: http://www.lastcraft.com
  13  * Version adapted from Simple Test: http://sourceforge.net/projects/simpletest/
  14  * For an intro to the Lexer see:
  15  * http://www.phppatterns.com/index.php/article/articleview/106/1/2/
  16  * @author Marcus Baker
  17  * @package ActiveSupport
  18  * @subpackage GenericParser
  19  */
  20  
  21  /**#@+
  22  * lexer mode constant
  23  */
  24  define("AK_LEXER_ENTER", 1);
  25  define("AK_LEXER_MATCHED", 2);
  26  define("AK_LEXER_UNMATCHED", 3);
  27  define("AK_LEXER_EXIT", 4);
  28  define("AK_LEXER_SPECIAL", 5);
  29  /**#@-*/
  30  
  31  /**
  32   *    Compounded regular expression. Any of
  33   *    the contained patterns could match and
  34   *    when one does it's label is returned.
  35   */
  36  class AkLexerParallelRegex {
  37      var $_patterns;
  38      var $_labels;
  39      var $_regex;
  40      var $_case;
  41  
  42      /**
  43       *    Constructor. Starts with no patterns.
  44       *    @param boolean $case    True for case sensitive, false
  45       *                            for insensitive.
  46       *    @access public
  47       */
  48      function AkLexerParallelRegex($case) {
  49          $this->_case = $case;
  50          $this->_patterns = array();
  51          $this->_labels = array();
  52          $this->_regex = null;
  53      }
  54  
  55      /**
  56       *    Adds a pattern with an optional label.
  57       *    @param mixed $pattern       Perl style regex. Must be UTF-8
  58       *                                encoded. If its a string, the (, )
  59       *                                lose their meaning unless they
  60       *                                form part of a lookahead or
  61       *                                lookbehind assertation.
  62       *    @param string $label        Label of regex to be returned
  63       *                                on a match. Label must be ASCII
  64       *    @access public
  65       */
  66      function addPattern($pattern, $label = true) {
  67          $count = count($this->_patterns);
  68          $this->_patterns[$count] = $pattern;
  69          $this->_labels[$count] = $label;
  70          $this->_regex = null;
  71      }
  72  
  73      /**
  74       *    Attempts to match all patterns at once against
  75       *    a string.
  76       *    @param string $subject      String to match against.
  77       *    @param string $match        First matched portion of
  78       *                                subject.
  79       *    @return boolean             True on success.
  80       *    @access public
  81       */
  82      function match($subject, &$match) {
  83          if (count($this->_patterns) == 0) {
  84              return false;
  85          }
  86          if (! preg_match($this->_getCompoundedRegex(), $subject, $matches)) {
  87              $match = '';
  88              return false;
  89          }
  90  
  91          $match = $matches[0];
  92          $size = count($matches);
  93          for ($i = 1; $i < $size; $i++) {
  94              if ($matches[$i] && isset($this->_labels[$i - 1])) {
  95                  return $this->_labels[$i - 1];
  96              }
  97          }
  98          return true;
  99      }
 100  
 101      /**
 102       *    Attempts to split the string against all patterns at once
 103       *
 104       *    @param string $subject      String to match against.
 105       *    @param array $split         The split result: array containing, pre-match, match & post-match strings
 106       *    @return boolean             True on success.
 107       *    @access public
 108       *
 109       *    @author Christopher Smith <chris@jalakai.co.uk>
 110       */
 111      function split($subject, &$split) {
 112          if (count($this->_patterns) == 0) {
 113              return false;
 114          }
 115  
 116          if (! preg_match($this->_getCompoundedRegex(), $subject, $matches)) {
 117              $split = array($subject, "", "");
 118              return false;
 119          }
 120  
 121          $idx = count($matches)-2;
 122  
 123          list($pre, $post) = preg_split($this->_patterns[$idx].$this->_getPerlMatchingFlags(), $subject, 2);
 124  
 125          $split = array($pre, $matches[0], $post);
 126          return isset($this->_labels[$idx]) ? $this->_labels[$idx] : true;
 127      }
 128  
 129      /**
 130       *    Compounds the patterns into a single
 131       *    regular expression separated with the
 132       *    "or" operator. Caches the regex.
 133       *    Will automatically escape (, ) and / tokens.
 134       *    @param array $patterns    List of patterns in order.
 135       *    @access private
 136       */
 137      function _getCompoundedRegex() {
 138          if ($this->_regex == null) {
 139              $cnt = count($this->_patterns);
 140              for ($i = 0; $i < $cnt; $i++) {
 141  
 142                  // Replace lookaheads / lookbehinds with marker
 143                  $m = "\1\1";
 144                  $pattern = preg_replace(
 145                  array (
 146                  '/\(\?(i|m|s|x|U)\)/U',
 147                  '/\(\?(\-[i|m|s|x|U])\)/U',
 148                  '/\(\?\=(.*)\)/sU',
 149                  '/\(\?\!(.*)\)/sU',
 150                  '/\(\?\<\=(.*)\)/sU',
 151                  '/\(\?\<\!(.*)\)/sU',
 152                  '/\(\?\:(.*)\)/sU',
 153                  ),
 154                  array (
 155                  $m.'SO:\\1'.$m,
 156                  $m.'SOR:\\1'.$m,
 157                  $m.'LA:IS:\\1'.$m,
 158                  $m.'LA:NOT:\\1'.$m,
 159                  $m.'LB:IS:\\1'.$m,
 160                  $m.'LB:NOT:\\1'.$m,
 161                  $m.'GRP:\\1'.$m,
 162                  ),
 163                  $this->_patterns[$i]
 164                  );
 165                  // Quote the rest
 166                  $pattern = str_replace(
 167                  array('/', '(', ')'),
 168                  array('\/', '\(', '\)'),
 169                  $pattern
 170                  );
 171  
 172                  // Restore lookaheads / lookbehinds
 173                  $pattern = preg_replace(
 174                  array (
 175                  '/'.$m.'SO:(.{1})'.$m.'/',
 176                  '/'.$m.'SOR:(.{2})'.$m.'/',
 177                  '/'.$m.'LA:IS:(.*)'.$m.'/sU',
 178                  '/'.$m.'LA:NOT:(.*)'.$m.'/sU',
 179                  '/'.$m.'LB:IS:(.*)'.$m.'/sU',
 180                  '/'.$m.'LB:NOT:(.*)'.$m.'/sU',
 181                  '/'.$m.'GRP:(.*)'.$m.'/sU',
 182                  ),
 183                  array (
 184                  '(?\\1)',
 185                  '(?\\1)',
 186                  '(?=\\1)',
 187                  '(?!\\1)',
 188                  '(?<=\\1)',
 189                  '(?<!\\1)',
 190                  '(?:\\1)',
 191                  ),
 192                  $pattern
 193                  );
 194  
 195                  $this->_patterns[$i] = '('.$pattern.')';
 196              }
 197              $this->_regex = "/" . implode("|", $this->_patterns) . "/" . $this->_getPerlMatchingFlags();
 198          }
 199          return $this->_regex;
 200      }
 201  
 202      /**
 203       *    Accessor for perl regex mode flags to use.
 204       *    @return string       Perl regex flags.
 205       *    @access private
 206       */
 207      function _getPerlMatchingFlags() {
 208          return ($this->_case ? "msS" : "msSi");
 209      }
 210  }
 211  
 212  /**
 213   *    States for a stack machine.
 214   */
 215  class AkLexerStateStack {
 216      var $_stack;
 217  
 218      /**
 219       *    Constructor. Starts in named state.
 220       *    @param string $start        Starting state name.
 221       *    @access public
 222       */
 223      function AkLexerStateStack($start) {
 224          $this->_stack = array($start);
 225      }
 226  
 227      /**
 228       *    Accessor for current state.
 229       *    @return string       State.
 230       *    @access public
 231       */
 232      function getCurrent() {
 233          return $this->_stack[count($this->_stack) - 1];
 234      }
 235  
 236      /**
 237       *    Adds a state to the stack and sets it
 238       *    to be the current state.
 239       *    @param string $state        New state.
 240       *    @access public
 241       */
 242      function enter($state) {
 243          array_push($this->_stack, $state);
 244      }
 245  
 246      /**
 247       *    Leaves the current state and reverts
 248       *    to the previous one.
 249       *    @return boolean    False if we drop off
 250       *                       the bottom of the list.
 251       *    @access public
 252       */
 253      function leave() {
 254          if (count($this->_stack) == 1) {
 255              return false;
 256          }
 257          array_pop($this->_stack);
 258          return true;
 259      }
 260  }
 261  
 262  /**
 263   *    Accepts text and breaks it into tokens.
 264   *    Some optimisation to make the sure the
 265   *    content is only scanned by the PHP regex
 266   *    parser once. Lexer modes must not start
 267   *    with leading underscores.
 268   */
 269  class AkLexer {
 270      var $_regexes;
 271      var $_parser;
 272      var $_mode;
 273      var $_mode_handlers;
 274      var $_case;
 275  
 276      /**
 277       *    Sets up the lexer in case insensitive matching
 278       *    by default.
 279       *    @param AkParser $parser  Handling strategy by
 280       *                                    reference.
 281       *    @param string $start            Starting handler.
 282       *    @param boolean $case            True for case sensitive.
 283       *    @access public
 284       */
 285      function AkLexer(&$parser, $start = 'accept', $case = false) {
 286          $this->_case = $case;
 287          $this->_regexes = array();
 288          $this->_parser = &$parser;
 289          $this->_mode = &new AkLexerStateStack($start);
 290          $this->_mode_handlers = array();
 291      }
 292  
 293      /**
 294       *    Adds a token search pattern for a particular
 295       *    parsing mode. The pattern does not change the
 296       *    current mode.
 297       *    @param string $pattern      Perl style regex, but ( and )
 298       *                                lose the usual meaning.
 299       *    @param string $mode         Should only apply this
 300       *                                pattern when dealing with
 301       *                                this type of input.
 302       *    @access public
 303       */
 304      function addPattern($pattern, $mode = "accept") {
 305          if (! isset($this->_regexes[$mode])) {
 306              $this->_regexes[$mode] = new AkLexerParallelRegex($this->_case);
 307          }
 308          $this->_regexes[$mode]->addPattern($pattern);
 309      }
 310  
 311      /**
 312       *    Adds a pattern that will enter a new parsing
 313       *    mode. Useful for entering parenthesis, strings,
 314       *    tags, etc.
 315       *    @param string $pattern      Perl style regex, but ( and )
 316       *                                lose the usual meaning.
 317       *    @param string $mode         Should only apply this
 318       *                                pattern when dealing with
 319       *                                this type of input.
 320       *    @param string $new_mode     Change parsing to this new
 321       *                                nested mode.
 322       *    @access public
 323       */
 324      function addEntryPattern($pattern, $mode, $new_mode) {
 325          if (! isset($this->_regexes[$mode])) {
 326              $this->_regexes[$mode] = new AkLexerParallelRegex($this->_case);
 327          }
 328          $this->_regexes[$mode]->addPattern($pattern, $new_mode);
 329      }
 330  
 331      /**
 332       *    Adds a pattern that will exit the current mode
 333       *    and re-enter the previous one.
 334       *    @param string $pattern      Perl style regex, but ( and )
 335       *                                lose the usual meaning.
 336       *    @param string $mode         Mode to leave.
 337       *    @access public
 338       */
 339      function addExitPattern($pattern, $mode) {
 340          if (! isset($this->_regexes[$mode])) {
 341              $this->_regexes[$mode] = new AkLexerParallelRegex($this->_case);
 342          }
 343          $this->_regexes[$mode]->addPattern($pattern, '__exit');
 344      }
 345  
 346      /**
 347       *    Adds a pattern that has a special mode. Acts as an entry
 348       *    and exit pattern in one go, effectively calling a special
 349       *    parser handler for this token only.
 350       *    @param string $pattern      Perl style regex, but ( and )
 351       *                                lose the usual meaning.
 352       *    @param string $mode         Should only apply this
 353       *                                pattern when dealing with
 354       *                                this type of input.
 355       *    @param string $special      Use this mode for this one token.
 356       *    @access public
 357       */
 358      function addSpecialPattern($pattern, $mode, $special) {
 359          if (! isset($this->_regexes[$mode])) {
 360              $this->_regexes[$mode] = new AkLexerParallelRegex($this->_case);
 361          }
 362          $this->_regexes[$mode]->addPattern($pattern, "_$special");
 363      }
 364  
 365      /**
 366       *    Adds a mapping from a mode to another handler.
 367       *    @param string $mode        Mode to be remapped.
 368       *    @param string $handler     New target handler.
 369       *    @access public
 370       */
 371      function mapHandler($mode, $handler) {
 372          $this->_mode_handlers[$mode] = $handler;
 373      }
 374  
 375      /**
 376       *    Splits the page text into tokens. Will fail
 377       *    if the handlers report an error or if no
 378       *    content is consumed. If successful then each
 379       *    unparsed and parsed token invokes a call to the
 380       *    held listener.
 381       *    @param string $raw        Raw HTML text.
 382       *    @return boolean           True on success, else false.
 383       *    @access public
 384       */
 385      function parse($raw) {
 386          if (! isset($this->_parser)) {
 387              return false;
 388          }
 389  
 390          $initialLength = strlen($raw);
 391          $length = $initialLength;
 392          $pos = 0;
 393          while (is_array($parsed = $this->_reduce($raw))) {
 394              list($unmatched, $matched, $mode) = $parsed;
 395              $currentLength = strlen($raw);
 396              $matchPos = $initialLength - $currentLength - strlen($matched);
 397              if (! $this->_dispatchTokens($unmatched, $matched, $mode, $pos, $matchPos)) {
 398                  return false;
 399              }
 400              if ($currentLength == $length) {
 401                  return false;
 402              }
 403              $length = $currentLength;
 404              $pos = $initialLength - $currentLength;
 405          }
 406          if (!$parsed) {
 407              return false;
 408          }
 409          return $this->_invokeParser($raw, AK_LEXER_UNMATCHED, $pos);
 410      }
 411  
 412      /**
 413       *    Sends the matched token and any leading unmatched
 414       *    text to the parser changing the lexer to a new
 415       *    mode if one is listed.
 416       *    @param string $unmatched    Unmatched leading portion.
 417       *    @param string $matched      Actual token match.
 418       *    @param string $mode         Mode after match. A boolean
 419       *                                false mode causes no change.
 420       *    @param int $pos         Current byte index location in raw doc
 421       *                                thats being parsed
 422       *    @return boolean             False if there was any error
 423       *                                from the parser.
 424       *    @access private
 425       */
 426      function _dispatchTokens($unmatched, $matched, $mode = false, $initialPos, $matchPos) {
 427          if (! $this->_invokeParser($unmatched, AK_LEXER_UNMATCHED, $initialPos) ){
 428              return false;
 429          }
 430          if ($this->_isModeEnd($mode)) {
 431              if (! $this->_invokeParser($matched, AK_LEXER_EXIT, $matchPos)) {
 432                  return false;
 433              }
 434              return $this->_mode->leave();
 435          }
 436          if ($this->_isSpecialMode($mode)) {
 437              $this->_mode->enter($this->_decodeSpecial($mode));
 438              if (! $this->_invokeParser($matched, AK_LEXER_SPECIAL, $matchPos)) {
 439                  return false;
 440              }
 441              return $this->_mode->leave();
 442          }
 443          if (is_string($mode)) {
 444              $this->_mode->enter($mode);
 445              return $this->_invokeParser($matched, AK_LEXER_ENTER, $matchPos);
 446          }
 447          return $this->_invokeParser($matched, AK_LEXER_MATCHED, $matchPos);
 448      }
 449  
 450      /**
 451       *    Tests to see if the new mode is actually to leave
 452       *    the current mode and pop an item from the matching
 453       *    mode stack.
 454       *    @param string $mode    Mode to test.
 455       *    @return boolean        True if this is the exit mode.
 456       *    @access private
 457       */
 458      function _isModeEnd($mode) {
 459          return ($mode === "__exit");
 460      }
 461  
 462      /**
 463       *    Test to see if the mode is one where this mode
 464       *    is entered for this token only and automatically
 465       *    leaves immediately afterwoods.
 466       *    @param string $mode    Mode to test.
 467       *    @return boolean        True if this is the exit mode.
 468       *    @access private
 469       */
 470      function _isSpecialMode($mode) {
 471          return (strncmp($mode, "_", 1) == 0);
 472      }
 473  
 474      /**
 475       *    Strips the magic underscore marking single token
 476       *    modes.
 477       *    @param string $mode    Mode to decode.
 478       *    @return string         Underlying mode name.
 479       *    @access private
 480       */
 481      function _decodeSpecial($mode) {
 482          return substr($mode, 1);
 483      }
 484  
 485      /**
 486       *    Calls the parser method named after the current
 487       *    mode. Empty content will be ignored. The lexer
 488       *    has a parser handler for each mode in the lexer.
 489       *    @param string $content        Text parsed.
 490       *    @param boolean $is_match      Token is recognised rather
 491       *                                  than unparsed data.
 492       *    @param int $pos         Current byte index location in raw doc
 493       *                                thats being parsed
 494       *    @access private
 495       */
 496      function _invokeParser($content, $is_match, $pos) {
 497          if (($content === '') || ($content === false)) {
 498              return true;
 499          }
 500          $handler = $this->_mode->getCurrent();
 501          if (isset($this->_mode_handlers[$handler])) {
 502              $handler = $this->_mode_handlers[$handler];
 503          }
 504          return $this->_parser->$handler($content, $is_match, $pos);
 505      }
 506  
 507      /**
 508       *    Tries to match a chunk of text and if successful
 509       *    removes the recognised chunk and any leading
 510       *    unparsed data. Empty strings will not be matched.
 511       *    @param string $raw         The subject to parse. This is the
 512       *                               content that will be eaten.
 513       *    @return array              Three item list of unparsed
 514       *                               content followed by the
 515       *                               recognised token and finally the
 516       *                               action the parser is to take.
 517       *                               True if no match, false if there
 518       *                               is a parsing error.
 519       *    @access private
 520       */
 521      function _reduce(&$raw) {
 522          if (! isset($this->_regexes[$this->_mode->getCurrent()])) {
 523              return false;
 524          }
 525          if ($raw === "") {
 526              return true;
 527          }
 528          if ($action = $this->_regexes[$this->_mode->getCurrent()]->split($raw, $split)) {
 529              list($unparsed, $match, $raw) = $split;
 530              return array($unparsed, $match, $action);
 531          }
 532          return true;
 533      }
 534  }
 535  
 536  
 537  ?>


Generated: Mon Oct 27 12:43:49 2008 Cross-referenced by PHPXref 0.6