/*
 * $Header: /home/projects/jaxen/scm/jaxen/src/java/main/org/jaxen/saxpath/base/XPathLexer.java,v 1.12 2005/08/09 15:16:40 elharo Exp $
 * $Revision: 1.12 $
 * $Date: 2005/08/09 15:16:40 $
 *
 * ====================================================================
 *
 * Copyright (C) 2000-2002 bob mcwhirter & James Strachan.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions, and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions, and the disclaimer that follows
 *    these conditions in the documentation and/or other materials
 *    provided with the distribution.
 *
 * 3. The name "Jaxen" must not be used to endorse or promote products
 *    derived from this software without prior written permission.  For
 *    written permission, please contact license@jaxen.org.
 *
 * 4. Products derived from this software may not be called "Jaxen", nor
 *    may "Jaxen" appear in their name, without prior written permission
 *    from the Jaxen Project Management (pm@jaxen.org).
 *
 * In addition, we request (but do not require) that you include in the
 * end-user documentation provided with the redistribution and/or in the
 * software itself an acknowledgement equivalent to the following:
 *     "This product includes software developed by the
 *      Jaxen Project <http://www.jaxen.org/>."
 * Alternatively, the acknowledgment may be graphical using the logos
 * available at http://www.jaxen.org/
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE Jaxen AUTHORS OR THE PROJECT
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * ====================================================================
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Jaxen Project and was originally
 * created by bob mcwhirter <bob@werken.com> and
 * James Strachan <jstrachan@apache.org>.  For more information on the
 * Jaxen Project, please see <http://www.jaxen.org/>.
 *
 * $Id: XPathLexer.java,v 1.12 2005/08/09 15:16:40 elharo Exp $
 */

package org.jaxen.saxpath.base;
66  
67  class XPathLexer
68  {
69      private String xpath;
70      private int    currentPosition;
71      private int    endPosition;
72  
73      private Token  previousToken;
74  
75      XPathLexer(String xpath)
76      {
77          setXPath( xpath );
78      }
79  
80      private void setXPath(String xpath)
81      {
82          this.xpath           = xpath;
83          this.currentPosition = 0;
84          this.endPosition     = xpath.length();
85      }
86  
87      String getXPath()
88      {
89          return this.xpath;
90      }
91  
92      Token nextToken()
93      {
94          Token token = null;
95  
96          do
97          {
98              token = null;
99  
100             switch ( LA(1) )
101             {
102                 case '$':
103                 {
104                     token = dollar();
105                     break;
106                 }
107                     
108                 case '"':
109                 case '\'':
110                 {
111                     token = literal();
112                     break;
113                 }
114                     
115                 case '/':
116                 {
117                     token = slashes();
118                     break;
119                 }
120 
121                 case ',':
122                 {
123                     token = comma();
124                     break;
125                 }
126                     
127                 case '(':
128                 {
129                     token = leftParen();
130                     break;
131                 }
132                     
133                 case ')':
134                 {
135                     token = rightParen();
136                     break;
137                 }
138                     
139                 case '[':
140                 {
141                     token = leftBracket();
142                     break;
143                 }
144                     
145                 case ']':
146                 {
147                     token = rightBracket();
148                     break;
149                 }
150                     
151                 case '+':
152                 {
153                     token = plus();
154                     break;
155                 }
156                     
157                 case '-':
158                 {
159                     token = minus();
160                     break;
161                 }
162                     
163                 case '<':
164                 case '>':
165                 {
166                     token = relationalOperator();
167                     break;
168                 }        
169 
170                 case '=':
171                 {
172                     token = equals();
173                     break;
174                 }
175                     
176                 case '!':
177                 {
178                     if ( LA(2) == '=' )
179                     {
180                         token = notEquals();
181                     }
182                     else
183                     {
184                         token = not();
185                     }
186                     break;
187                 }
188                     
189                 case '|':
190                 {
191                     token = pipe();
192                     break;
193                 }
194                     
195                 case '@':
196                 {
197                     token = at();
198                     break;
199                 }
200                     
201                 case ':':
202                 {
203                     if ( LA(2) == ':' )
204                     {
205                         token = doubleColon();
206                     }
207                     else
208                     {
209                         token = colon();
210                     }
211                     break;
212                 }
213                     
214                 case '*':
215                 {
216                     token = star();
217                     break;
218                 }
219                     
220                 case '.':
221                 {
222                     switch ( LA(2) )
223                     {
224                         case '0':
225                         case '1':
226                         case '2':
227                         case '3':
228                         case '4':
229                         case '5':
230                         case '6':
231                         case '7':
232                         case '8':
233                         case '9':
234                         {
235                             token = number();
236                             break;
237                         }
238                         default:
239                         {
240                             token = dots();
241                             break;
242                         }
243                     }
244                     break;
245                 }
246 
247                 case '0':
248                 case '1':
249                 case '2':
250                 case '3':
251                 case '4':
252                 case '5':
253                 case '6':
254                 case '7':
255                 case '8':
256                 case '9':
257                 {
258                     token = number();
259                     break;
260                 }
261 
262                 case ' ':
263                 case '\t':
264                 case '\n':
265                 case '\r':
266                 {
267                     token = whitespace();
268                     break;
269                 }
270                     
271                 default:
272                 {
273                     if ( isIdentifierStartChar( LA(1) ) )
274                     {
275                         token = identifierOrOperatorName();
276                     }
277                 }
278             }
279 
280             if ( token == null )
281             {
282                 if (!hasMoreChars())
283                 {
284                     token = new Token( TokenTypes.EOF,
285                                    getXPath(),
286                                    currentPosition(),
287                                    endPosition() );
288             }
289                 else
290                 {
291                     token = new Token( TokenTypes.ERROR,
292                                    getXPath(),
293                                    currentPosition(),
294                                    endPosition() );
295                 }
296             }
297 
298         }
299         while ( token.getTokenType() == TokenTypes.SKIP );
300 
301         setPreviousToken( token );
302         
303         return token;
304     }
305 
306     private Token identifierOrOperatorName()
307     {
308         Token token = null;
309     
310         if ( previousToken != null )
311         {
312             // For some reason, section 3.7, Lexical structure,
313             // doesn't seem to feel like it needs to mention the
314             // SLASH, DOUBLE_SLASH, and COLON tokens for the test
315             // if an NCName is an operator or not.
316             //
317             // According to section 3.7, "/foo" should be considered
318             // as a SLASH following by an OperatorName being 'foo'.
319             // Which is just simply, clearly, wrong, in my mind.
320             //
321             //     -bob
322             
323             switch ( previousToken.getTokenType() )
324             {
325                 case TokenTypes.AT:
326                 case TokenTypes.DOUBLE_COLON:
327                 case TokenTypes.LEFT_PAREN:
328                 case TokenTypes.LEFT_BRACKET:
329                 case TokenTypes.AND:
330                 case TokenTypes.OR:
331                 case TokenTypes.MOD:
332                 case TokenTypes.DIV:
333                 case TokenTypes.COLON:
334                 case TokenTypes.SLASH:
335                 case TokenTypes.DOUBLE_SLASH:
336                 case TokenTypes.PIPE:
337                 case TokenTypes.DOLLAR:
338                 case TokenTypes.PLUS:
339                 case TokenTypes.MINUS:
340                 case TokenTypes.STAR:
341                 case TokenTypes.COMMA:
342                 case TokenTypes.LESS_THAN_SIGN:
343                 case TokenTypes.GREATER_THAN_SIGN:
344                 case TokenTypes.LESS_THAN_OR_EQUALS_SIGN:
345                 case TokenTypes.GREATER_THAN_OR_EQUALS_SIGN:
346                 case TokenTypes.EQUALS:
347                 case TokenTypes.NOT_EQUALS:
348                 {
349                     token = identifier();
350                     break;
351                 }
352                 default:
353                 {
354                     token = operatorName();
355                     break;
356                 }
357             }
358         }
359         else
360         {
361             token = identifier();
362         }
363     
364         return token;
365     }
366     
367     private Token identifier()
368     {
369         Token token = null;
370     
371         int start = currentPosition();
372     
373         while ( hasMoreChars() )
374         {
375             if ( isIdentifierChar( LA(1) ) )
376             {
377                 consume();
378             }
379             else
380             {
381                 break;
382             }
383         }
384     
385         token = new Token( TokenTypes.IDENTIFIER,
386                            getXPath(),
387                            start,
388                            currentPosition() );
389     
390         return token;
391     }
392     
393     private Token operatorName()
394     {
395         Token token = null;
396     
397         switch ( LA(1) )
398         {
399             case 'a':
400             {
401                 token = and();
402                 break;
403             }
404     
405             case 'o':
406             {
407                 token = or();
408                 break;
409             }
410     
411             case 'm':
412             {
413                 token = mod();
414                 break;
415             }
416     
417             case 'd':
418             {
419                 token = div();
420                 break;
421             }
422         }
423     
424         return token;
425     }
426     
427     private Token mod()
428     {
429         Token token = null;
430     
431         if ( ( LA(1) == 'm' )
432              &&
433              ( LA(2) == 'o' )
434              &&
435              ( LA(3) == 'd' )
436            )
437         {
438             token = new Token( TokenTypes.MOD,
439                                getXPath(),
440                                currentPosition(),
441                                currentPosition()+3 );
442     
443             consume();
444             consume();
445             consume();
446         }
447     
448         return token;
449     }
450     
451     private Token div()
452     {
453         Token token = null;
454     
455         if ( ( LA(1) == 'd' )
456              &&
457              ( LA(2) == 'i' )
458              &&
459              ( LA(3) == 'v' )
460             )
461         {
462             token = new Token( TokenTypes.DIV,
463                                getXPath(),
464                                currentPosition(),
465                                currentPosition()+3 );
466     
467             consume();
468             consume();
469             consume();
470         }
471     
472         return token;
473     }
474     
475     private Token and()
476     {
477         Token token = null;
478     
479         if ( ( LA(1) == 'a' )
480              &&
481              ( LA(2) == 'n' )
482              &&
483              ( LA(3) == 'd' )
484            )
485         {
486             token = new Token( TokenTypes.AND,
487                                getXPath(),
488                                currentPosition(),
489                                currentPosition()+3 );
490     
491             consume();
492             consume();
493             consume();
494         }
495     
496         return token;
497     }
498     
499     private Token or()
500     {
501         Token token = null;
502     
503         if ( ( LA(1) == 'o' )
504              &&
505              ( LA(2) == 'r' )
506            )
507         {
508             token = new Token( TokenTypes.OR,
509                                getXPath(),
510                                currentPosition(),
511                                currentPosition()+2 );
512     
513             consume();
514             consume();
515         }
516     
517         return token;
518     }
519     
520     private Token number()
521     {
522         int     start         = currentPosition();
523         boolean periodAllowed = true;
524     
525       loop:
526         while( true )
527         {
528             switch ( LA(1) )
529             {
530                 case '.':
531                 {
532                     if ( periodAllowed )
533                     {
534                         periodAllowed = false;
535                         consume();
536                     }
537                     else
538                     {
539                         break loop;
540                     }
541                     break;
542                 }
543                 
544                 case '0':
545                 case '1':
546                 case '2':
547                 case '3':
548                 case '4':
549                 case '5':
550                 case '6':
551                 case '7':
552                 case '8':
553                 case '9':
554                 {
555                     consume();
556                     break;
557                 }
558                 default:
559                 {
560                     break loop;
561                 }
562             }
563         }
564     
565         Token token = null;
566     
567         if ( periodAllowed )
568         {
569             token = new Token( TokenTypes.INTEGER,
570                                getXPath(),
571                                start,
572                                currentPosition() );
573         }
574         else
575         {
576             token = new Token( TokenTypes.DOUBLE,
577                                getXPath(),
578                                start,
579                                currentPosition() );
580         }
581     
582         return token;
583     }
584     
585     private Token whitespace()
586     {
587         consume();
588             
589       loop:
590         while( hasMoreChars() )
591         {
592             switch ( LA(1) )
593             {
594                 case ' ':
595                 case '\t':
596                 case '\n':
597                 case '\r':
598                 {
599                     consume();
600                     break;
601                 }
602                     
603                 default:
604                 {
605                     break loop;
606                 }
607             }
608         }
609     
610         return new Token( TokenTypes.SKIP,
611                           getXPath(),
612                           0,
613                           0 );
614     }
615     
616     private Token comma()
617     {
618         Token token = new Token( TokenTypes.COMMA,
619                                  getXPath(),
620                                  currentPosition(),
621                                  currentPosition()+1 );
622     
623         consume();
624     
625         return token;
626     }
627     
628     private Token equals()
629     {
630         Token token = new Token( TokenTypes.EQUALS,
631                                  getXPath(),
632                                  currentPosition(),
633                                  currentPosition()+1 );
634     
635         consume();
636     
637         return token;
638     }
639     
640     private Token minus()
641     {
642         Token token = new Token( TokenTypes.MINUS,
643                                  getXPath(),
644                                  currentPosition(),
645                                  currentPosition()+1 );
646         consume();
647             
648         return token;
649     }
650     
651     private Token plus()
652     {
653         Token token = new Token( TokenTypes.PLUS,
654                                  getXPath(),
655                                  currentPosition(),
656                                  currentPosition()+1 );
657         consume();
658     
659         return token;
660     }
661     
662     private Token dollar()
663     {
664         Token token = new Token( TokenTypes.DOLLAR,
665                                  getXPath(),
666                                  currentPosition(),
667                                  currentPosition()+1 );
668         consume();
669     
670         return token;
671     }
672     
673     private Token pipe()
674     {
675         Token token = new Token( TokenTypes.PIPE,
676                                  getXPath(),
677                                  currentPosition(),
678                                  currentPosition()+1 );
679     
680         consume();
681     
682         return token;
683     }
684     
685     private Token at()
686     {
687         Token token = new Token( TokenTypes.AT,
688                                  getXPath(),
689                                  currentPosition(),
690                                  currentPosition()+1 );
691     
692         consume();
693     
694         return token;
695     }
696     
697     private Token colon()
698     {
699         Token token = new Token( TokenTypes.COLON,
700                                  getXPath(),
701                                  currentPosition(),
702                                  currentPosition()+1 );
703         consume();
704     
705         return token;
706     }
707     
708     private Token doubleColon()
709     {
710         Token token = new Token( TokenTypes.DOUBLE_COLON,
711                                  getXPath(),
712                                  currentPosition(),
713                                  currentPosition()+2 );
714     
715         consume();
716         consume();
717     
718         return token;
719     }
720     
721     private Token not()
722     {
723         Token token = new Token( TokenTypes.NOT,
724                                  getXPath(),
725                                  currentPosition(),
726                                  currentPosition() + 1 );
727     
728         consume();
729     
730         return token;
731     }
732     
733     private Token notEquals()
734     {
735         Token token = new Token( TokenTypes.NOT_EQUALS,
736                                  getXPath(),
737                                  currentPosition(),
738                                  currentPosition() + 2 );
739     
740         consume();
741         consume();
742     
743         return token;
744     }
745     
746     private Token relationalOperator()
747     {
748         Token token = null;
749     
750         switch ( LA(1) )
751         {
752             case '<':
753             {
754                 if ( LA(2) == '=' )
755                 {
756                     token = new Token( TokenTypes.LESS_THAN_OR_EQUALS_SIGN,
757                                        getXPath(),
758                                        currentPosition(),
759                                        currentPosition() + 2 );
760                     consume();
761                 }
762                 else
763                 {
764                     token = new Token( TokenTypes.LESS_THAN_SIGN,
765                                        getXPath(),
766                                        currentPosition(),
767                                        currentPosition() + 1);
768                 }
769     
770                 consume();
771                 break;
772             }
773             case '>':
774             {
775                 if ( LA(2) == '=' )
776                 {
777                     token = new Token( TokenTypes.GREATER_THAN_OR_EQUALS_SIGN,
778                                        getXPath(),
779                                        currentPosition(),
780                                        currentPosition() + 2 );
781                     consume();
782                 }
783                 else
784                 {
785                     token = new Token( TokenTypes.GREATER_THAN_SIGN,
786                                        getXPath(),
787                                        currentPosition(),
788                                        currentPosition() + 1 );
789                 }
790     
791                 consume();
792                 break;
793             }
794         }
795     
796         return token;
797                 
798     }
799     
800     private Token star()
801     {
802         Token token = new Token( TokenTypes.STAR,
803                                  getXPath(),
804                                  currentPosition(),
805                                  currentPosition()+1 );
806     
807         consume();
808             
809         return token;
810     }
811     
812     private Token literal()
813     {
814         Token token = null;
815     
816         char match  = LA(1);
817     
818         consume();
819     
820         int start = currentPosition();
821             
822         while ( ( token == null )
823                 &&
824                 hasMoreChars() )
825         {
826             if ( LA(1) == match )
827             {
828                 token = new Token( TokenTypes.LITERAL,
829                                    getXPath(),
830                                    start,
831                                    currentPosition() );
832             }
833             consume();
834         }
835     
836         return token;
837     }
838     
839     private Token dots()
840     {
841         Token token = null;
842     
843         switch ( LA(2) )
844         {
845             case '.':
846             {
847                 token = new Token( TokenTypes.DOT_DOT,
848                                    getXPath(),
849                                    currentPosition(),
850                                    currentPosition()+2 ) ;
851                 consume();
852                 consume();
853                 break;
854             }
855             default:
856             {
857                 token = new Token( TokenTypes.DOT,
858                                    getXPath(),
859                                    currentPosition(),
860                                    currentPosition()+1 );
861                 consume();
862                 break;
863             }
864         }
865     
866         return token;
867     }
868     
869     private Token leftBracket()
870     {
871         Token token = new Token( TokenTypes.LEFT_BRACKET,
872                                  getXPath(),
873                                  currentPosition(),
874                                  currentPosition()+1 );
875     
876         consume();
877     
878         return token;
879     }
880     
881     private Token rightBracket()
882     {
883         Token token = new Token( TokenTypes.RIGHT_BRACKET,
884                                  getXPath(),
885                                  currentPosition(),
886                                  currentPosition()+1 );
887     
888         consume();
889     
890         return token;
891     }
892     
893     private Token leftParen()
894     {
895         Token token = new Token( TokenTypes.LEFT_PAREN,
896                                  getXPath(),
897                                  currentPosition(),
898                                  currentPosition()+1 );
899     
900         consume();
901     
902         return token;
903     }
904     
905     private Token rightParen()
906     {
907         Token token = new Token( TokenTypes.RIGHT_PAREN,
908                                  getXPath(),
909                                  currentPosition(),
910                                  currentPosition()+1 );
911     
912         consume();
913     
914         return token;
915     }
916     
917     private Token slashes()
918     {
919         Token token = null;
920     
921         switch ( LA(2) )
922         {
923             case '/':
924             {
925                 token = new Token( TokenTypes.DOUBLE_SLASH,
926                                    getXPath(),
927                                    currentPosition(),
928                                    currentPosition()+2 );
929                 consume();
930                 consume();
931                 break;
932             }
933             default:
934             {
935                 token = new Token( TokenTypes.SLASH,
936                                    getXPath(),
937                                    currentPosition(),
938                                    currentPosition()+1 );
939                 consume();
940             }
941         }
942     
943         return token;
944     }
945     
946     private char LA(int i) 
947     {
948         if ( currentPosition + ( i - 1 ) >= endPosition() )
949         {
950             return (char) -1;
951         }
952     
953         return getXPath().charAt( currentPosition() + (i - 1) );
954     }
955     
956     private void consume()
957     {
958         ++this.currentPosition;
959     }
960     
961     private int currentPosition()
962     {
963         return this.currentPosition;
964     }
965     
966     private int endPosition()
967     {
968         return this.endPosition;
969     }
970     
971     private void setPreviousToken(Token previousToken)
972     {
973         this.previousToken = previousToken;
974     }
975     
976     private boolean hasMoreChars()
977     {
978         return currentPosition() < endPosition();
979     }
980     
981     private boolean isIdentifierChar(char c)
982     {
983         return Verifier.isXMLNCNameCharacter( c );
984     }
985     
986     private boolean isIdentifierStartChar(char c)
987     {
988         return Verifier.isXMLNCNameStartCharacter( c );
989     }
990 
991 }