/* pyparsley/parser.y (590 lines, 14 KiB, plaintext) */

%{
/* C prologue: bison copies the text between %{ and %} into the generated parser. */
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "kstring.h"
#include <libxml/hash.h>
#ifndef PARSER_Y_H_INCLUDED
#define PARSER_Y_H_INCLUDED
/* Every semantic value in the grammar is a C string. */
#define YYSTYPE char *
/* Result of the most recent parse: written by answer(), returned by myparse(). */
static char* parsed_answer;
/* Lexer entry point and error hook; not defined in the visible part of this file. */
int yylex (void);
void yyerror (char const *);
/* Called by myparse() around yyparse(); not defined in the visible part of this file. */
void prepare_parse(char*);
void cleanup_parse(void);
void start_debugging(void);
/* Function-name alias table: filled by init_xpath_alias(), read by xpath_alias(). */
static xmlHashTablePtr alias_hash;
char* xpath_alias(char*);
void init_xpath_alias();
int yyparse(void);
char* myparse(char*);
void answer(char*);
#endif
%}
/* GLR parsing is required because the grammar relies on %dprec to resolve ambiguities. */
%glr-parser
%token_table
/* %debug makes yydebug available; start_debugging() switches it on. */
%debug
/*
 * Token names only: the lexemes behind them are defined by the lexer, which
 * is not part of this file. Declaration order is kept as-is because it
 * determines the token numbers shared with that lexer.
 */
/* Number, whitespace, punctuation and operator tokens. */
%token NUMBER
%token S
%token AT
%token LPAREN
%token RPAREN
%token PIPE
%token LT
%token SLASH
%token DBLSLASH
%token BANG
%token COLON
%token DBLCOLON
%token QUERY
%token HASH
%token COMMA
%token DOT
%token DBLDOT
%token GT
%token LBRA
%token RBRA
%token TILDE
%token SPLAT
%token PLUS
%token DASH
%token EQ
%token LTE
%token GTE
%token DOLLAR
%token BSLASHLIT
%token OTHER
/* Axis names (see the AxisName and keyword rules). */
%token XANCESTOR
%token XANCESTORSELF
%token XATTR
%token XCHILD
%token XDESC
%token XDESCSELF
%token XFOLLOW
%token XFOLLOWSIB
%token XNS
%token XPARENT
%token XPRE
%token XPRESIB
%token XSELF
/* Operator names (see the OrExpr, AndExpr and MultiplicativeExpr rules). */
%token XOR
%token XAND
%token XDIV
%token XMOD
/* Node types (see the NodeType rule). */
%token XCOMMENT
%token XTEXT
%token XPI
%token XNODE
/* Attribute-comparison operators used inside [...] in simple_selector_sequence. */
%token CXEQUATION
%token CXOPHE
%token CXOPNE
%token CXOPSTARTEQ
%token CXOPENDEQ
%token CXOPCONTAINS
%token CXOPCONTAINS2
/* Pseudo-class style selector tokens handled by simple_selector_sequence. */
%token CXFIRST
%token CXLAST
%token CXNOT
%token CXEVEN
%token CXODD
%token CXEQ
%token CXGT
%token CXLT
%token CXHEADER
%token CXCONTAINS
%token CXEMPTY
%token CXHAS
%token CXPARENT
%token CXNTHCH
%token CXNTHLASTCH
%token CXNTHTYPE
%token CXNTHLASTTYPE
%token CXFIRSTCH
%token CXLASTCH
%token CXFIRSTTYPE
%token CXLASTTYPE
%token CXONLYCH
%token CXONLYTYPE
%token CXINPUT
%token CXTEXT
%token CXPASSWORD
%token CXRADIO
%token CXCHECKBOX
%token CXSUBMIT
%token CXIMAGE
%token CXRESET
%token CXBUTTON
%token CXFILE
%token CXENABLED
%token CXDISABLED
%token CXCHECKED
%token CXSELECTED
/* Identifiers and quoted strings. */
%token NAME
%token STRING
%%
/*
 * Top-level rule: hands the string built for the whole expression to
 * answer(). The trailing optional whitespace is not part of the result.
 *
 * Throughout the grammar, alternatives without an action rely on bison's
 * default of passing the first component's value through unchanged.
 * NOTE(review): astrcat, astrcat3, ... are not defined in the visible source
 * (presumably kstring.h); they appear to join their arguments into one new
 * string - confirm.
 */
Root
: Expr OptS { answer($1); }
;
/*
 * A location path is either an XPath path or a CSS selector group. When the
 * GLR parser ends up with several parses, %dprec selects the alternative
 * with the highest value, so the selector reading (3) is preferred over the
 * two XPath readings (1).
 */
LocationPath
: RelativeLocationPath %dprec 1
| AbsoluteLocationPath %dprec 1
| selectors_group %dprec 3
;
/* A path that starts with SLASH, a lone SLASH, or a path that starts with DBLSLASH. */
AbsoluteLocationPath
: SLASH RelativeLocationPath { $$ = astrcat($1, $2); }
| SLASH
| AbbreviatedAbsoluteLocationPath
;
/* One or more steps separated by SLASH (here) or DBLSLASH (abbreviated form). */
RelativeLocationPath
: Step
| RelativeLocationPath SLASH Step { $$ = astrcat3($1, $2, $3); }
| AbbreviatedRelativeLocationPath
;
/* A step is axis + node test with at most one predicate, or DOT / DBLDOT. */
Step
: AxisSpecifier NodeTest { $$ = astrcat($1, $2); }
| AxisSpecifier NodeTest Predicate { $$ = astrcat3($1, $2, $3); }
| AbbreviatedStep
;
/* Explicit axis (name followed by DBLCOLON) or the abbreviated form. */
AxisSpecifier
: AxisName DBLCOLON { $$ = astrcat($1, $2); }
| AbbreviatedAxisSpecifier
;
/* The axis-name tokens; the same tokens are also accepted as identifiers via keyword. */
AxisName
: XANCESTOR
| XANCESTORSELF
| XATTR
| XCHILD
| XDESC
| XDESCSELF
| XFOLLOW
| XFOLLOWSIB
| XNS
| XPARENT
| XPRE
| XPRESIB
| XSELF
;
/* Name test, node-type test with empty parentheses, or XPI with a literal argument. */
NodeTest
: NameTest
| NodeType LPAREN RPAREN { $$ = astrcat3($1, $2, $3); }
| XPI LPAREN Literal RPAREN { $$ = astrcat4($1, $2, $3, $4); }
;
/* Bracketed predicate; the brackets are kept in the output. */
Predicate
: LBRA PredicateExpr RBRA { $$ = astrcat3($1, $2, $3); }
;
PredicateExpr
: Expr
;
/* DBLSLASH followed by a relative path. */
AbbreviatedAbsoluteLocationPath
: DBLSLASH RelativeLocationPath { $$ = astrcat($1, $2); }
;
/* Two path parts joined by DBLSLASH. */
AbbreviatedRelativeLocationPath
: RelativeLocationPath DBLSLASH Step { $$ = astrcat3($1, $2, $3); }
;
AbbreviatedStep
: DOT
| DBLDOT
;
/* AT, or nothing at all; the empty alternative contributes an empty string. */
AbbreviatedAxisSpecifier
: AT
| { $$ = ""; }
;
/*
 * Expression entry point. The first alternative drops the surrounding
 * parentheses from the output and, with %dprec 2, is preferred over the
 * OrExpr reading (which reaches PrimaryExpr and keeps the parentheses).
 */
Expr
: LPAREN Expr RPAREN %dprec 2 { $$ = $2; }
| OrExpr %dprec 1
;
/* Variable reference, parenthesised expression (parentheses kept), literal, number or call. */
PrimaryExpr
: VariableReference
| LPAREN Expr RPAREN { $$ = astrcat3($1, $2, $3); }
| Literal
| Number
| FunctionCall
;
/* The function name is passed through xpath_alias() before being emitted. */
FunctionCall
: FunctionName LPAREN Arguments RPAREN { $$ = astrcat4(xpath_alias($1), $2, $3, $4); }
;
/* Possibly empty argument list; the empty case contributes an empty string. */
Arguments
: ArgumentSet
| { $$ = ""; }
;
/* Comma-separated arguments; %dprec prefers the longer, comma-continued reading. */
ArgumentSet
: Argument COMMA ArgumentSet %dprec 2 { $$ = astrcat3($1, $2, $3); }
| Argument %dprec 1
;
/* Whitespace around an argument is accepted but not copied to the output. */
Argument
: OptS Expr OptS { $$ = $2; }
;
/* Paths joined with PIPE. */
UnionExpr
: PathExpr
| UnionExpr PIPE PathExpr { $$ = astrcat3($1, $2, $3); }
;
/* A location path, a filter expression, or a filter expression continued by a path. */
PathExpr
: LocationPath
| FilterExpr
| FilterExpr SLASH RelativeLocationPath { $$ = astrcat3($1, $2, $3); }
| FilterExpr DBLSLASH RelativeLocationPath { $$ = astrcat3($1, $2, $3); }
;
/* A primary expression followed by any number of predicates. */
FilterExpr
: PrimaryExpr
| FilterExpr Predicate { $$ = astrcat($1, $2); }
;
/*
 * Binary operator rules, lowest precedence first. Each action re-emits
 * left operand, operator token and right operand in that order.
 */
OrExpr
: AndExpr
| OrExpr XOR AndExpr { $$ = astrcat3($1, $2, $3); }
;
AndExpr
: EqualityExpr
| AndExpr XAND EqualityExpr { $$ = astrcat3($1, $2, $3); }
;
/* CXOPNE is the same token the attribute selector rules translate to " != ". */
EqualityExpr
: RelationalExpr
| EqualityExpr EQ RelationalExpr { $$ = astrcat3($1, $2, $3); }
| EqualityExpr CXOPNE RelationalExpr { $$ = astrcat3($1, $2, $3); }
;
RelationalExpr
: AdditiveExpr
| RelationalExpr LT AdditiveExpr { $$ = astrcat3($1, $2, $3); }
| RelationalExpr GT AdditiveExpr { $$ = astrcat3($1, $2, $3); }
| RelationalExpr LTE AdditiveExpr { $$ = astrcat3($1, $2, $3); }
| RelationalExpr GTE AdditiveExpr { $$ = astrcat3($1, $2, $3); }
;
AdditiveExpr
: MultiplicativeExpr
| AdditiveExpr PLUS MultiplicativeExpr { $$ = astrcat3($1, $2, $3); }
| AdditiveExpr DASH MultiplicativeExpr { $$ = astrcat3($1, $2, $3); }
;
MultiplicativeExpr
: UnaryExpr
| MultiplicativeExpr MultiplyOperator UnaryExpr { $$ = astrcat3($1, $2, $3); }
| MultiplicativeExpr XDIV UnaryExpr { $$ = astrcat3($1, $2, $3); }
| MultiplicativeExpr XMOD UnaryExpr { $$ = astrcat3($1, $2, $3); }
;
/* Any number of leading DASH tokens in front of a union expression. */
UnaryExpr
: UnionExpr
| DASH UnaryExpr { $$ = astrcat($1, $2); }
;
/*
 * Catalogue of expression tokens. No other rule in the visible grammar
 * refers to ExprToken, so it does not take part in parsing from Root.
 */
ExprToken
: LPAREN
| RPAREN
| LBRA
| RBRA
| DOT
| DBLDOT
| AT
| COMMA
| DBLCOLON
| NameTest
| NodeType
| Operator
| FunctionName
| AxisName
| Literal
| Number
| VariableReference
;
/* A literal is a STRING token, passed through as delivered by the lexer. */
Literal
: STRING
;
/* Numbers: NUMBER, NUMBER DOT, NUMBER DOT NUMBER, or DOT NUMBER. */
Number
: NUMBER
| NUMBER DOT { $$ = astrcat($1, $2); }
| NUMBER DOT NUMBER { $$ = astrcat3($1, $2, $3); }
| DOT NUMBER { $$ = astrcat($1, $2); }
;
/* Referenced only from ExprToken in the visible grammar. */
Operator
: OperatorName
| MultiplyOperator
| SLASH
| DBLSLASH
| PIPE
| PLUS
| DASH
| EQ
| CXOPNE
| LT
| LTE
| GT
| GTE
;
/* Referenced only from Operator in the visible grammar. */
OperatorName
: XAND
| XOR
| XMOD
| XDIV
;
/* SPLAT used as the multiplication operator (it is also a NameTest). */
MultiplyOperator
: SPLAT
;
/* DOLLAR followed by a qualified name. */
VariableReference
: DOLLAR QName { $$ = astrcat($1, $2); }
;
/* SPLAT alone, a prefix followed by COLON SPLAT, or a qualified name. */
NameTest
: SPLAT
| NCName COLON SPLAT { $$ = astrcat3($1, $2, $3); }
| QName
;
/* Node-type keywords accepted in front of "()" by NodeTest. */
NodeType
: XCOMMENT
| XTEXT
| XPI
| XNODE
;
/*
 * A single whitespace token. No other rule in the visible grammar refers to
 * this nonterminal. The terminating ';' was missing; it is added so the rule
 * is closed explicitly like every other rule in this grammar.
 */
ExprWhitespace
: S
;
/* A function name is any qualified name; FunctionCall maps it through xpath_alias(). */
FunctionName
: QName
;
/* Qualified name: either prefix COLON local-part, or a bare local part. */
QName
: PrefixedName
| UnprefixedName
;
PrefixedName
: Prefix COLON LocalPart { $$ = astrcat3($1, $2, $3); }
;
UnprefixedName
: LocalPart
;
Prefix
: NCName
;
LocalPart
: NCName
;
/* All name rules bottom out in the NAME token. */
NCName
: NAME
;
/*
 * Comma-separated list of selectors. Each selector is prefixed with ".//"
 * and the translated selectors are joined with "|".
 */
selectors_group
: attribute_extended_selector COMMA OptS selectors_group { $$ = astrcat4(".//", $1, "|", $4); }
| attribute_extended_selector { $$ = astrcat(".//", $1); }
;
/* A selector optionally followed by whitespace, AT and a NAME, which appends "/@" plus that name. */
attribute_extended_selector
: selector
| selector S AT NAME { $$ = astrcat3($1, "/@", $4); }
;
/* Compound selectors chained by combinators (right-recursive). */
selector
: simple_selector_sequence combinator selector { $$ = astrcat3($1, $2, $3); }
| simple_selector_sequence
;
/*
 * Each combinator token is replaced by a fixed XPath fragment: PLUS selects
 * the first following sibling, GT a child, TILDE any following sibling and
 * plain whitespace a descendant. Whitespace after PLUS, GT and TILDE is
 * accepted and discarded.
 */
combinator
: PLUS OptS { $$ = "/following-sibling::*[1]/self::"; }
| GT OptS { $$ = "/"; }
| TILDE OptS { $$ = "/following-sibling::*/self::"; }
| S { $$ = "//"; }
;
/*
 * One compound selector translated to XPath. It is either a bare anchor
 * (element name or universal selector) or an already translated sequence
 * (possibly empty, in which case possibly_empty_sequence supplies "*")
 * followed by one more qualifier, which appends a predicate or wraps the
 * sequence.
 *
 * Fixes relative to the previous revision, all of which produced XPath that
 * could not be evaluated as intended:
 *   - the CXCONTAINS predicate now closes its contains( call;
 *   - CXEVEN / CXODD use the XPath "mod" operator instead of "%";
 *   - CXONLYCH / CXONLYTYPE test last()=1 instead of calling count()
 *     without an argument;
 *   - the form-control predicates call name() instead of reading a child
 *     element called "name".
 *
 * NOTE(review): lower-case(), ends-with() and set-difference() are not XPath
 * 1.0 core functions; presumably the evaluator registers them - confirm.
 * NOTE(review): CXNTHLASTCH / CXNTHLASTTYPE emit "last() - n", which counts
 * from zero (n = 1 selects the second-to-last item); left unchanged, confirm
 * whether that is intended.
 */
simple_selector_sequence
: simple_selector_anchor
/* Id and class qualifiers. */
| possibly_empty_sequence HASH Ident { $$ = astrcat4($1, "[@id='", $3,"']"); }
| possibly_empty_sequence DOT Ident { $$ = astrcat4($1, "[contains(concat( ' ', @class, ' ' ), concat( ' ', '", $3, "', ' ' ))]"); }
/* Attribute qualifiers: presence test, then one alternative per comparison operator. */
| possibly_empty_sequence LBRA type_selector RBRA { $$ = astrcat4($1, "[@", $3, "]"); }
| possibly_empty_sequence LBRA type_selector OptS EQ OptS StringLike OptS RBRA { $$ = astrcat6($1, "[@", $3, " = ", $7, "]"); }
| possibly_empty_sequence LBRA type_selector OptS CXOPHE OptS StringLike OptS RBRA { $$ = astrcat10($1, "[@", $3, " = ", $7, " or starts-with(@", $3, ", concat(", $7, ", '-' ))]"); }
| possibly_empty_sequence LBRA type_selector OptS CXOPNE OptS StringLike OptS RBRA { $$ = astrcat6($1, "[@", $3, " != ", $7, "]"); }
| possibly_empty_sequence LBRA type_selector OptS CXOPSTARTEQ OptS StringLike OptS RBRA { $$ = astrcat6($1, "[starts-with(@", $3, ", ", $7, ")]"); }
| possibly_empty_sequence LBRA type_selector OptS CXOPENDEQ OptS StringLike OptS RBRA { $$ = astrcat6($1, "[ends-with(@", $3, ", ", $7, ")]"); }
| possibly_empty_sequence LBRA type_selector OptS CXOPCONTAINS OptS StringLike OptS RBRA { $$ = astrcat6($1, "[contains(@", $3, ", ", $7, ")]"); }
| possibly_empty_sequence LBRA type_selector OptS CXOPCONTAINS2 OptS StringLike OptS RBRA { $$ = astrcat6($1, "[contains(@", $3, ", ", $7, ")]"); }
/* Positional and content qualifiers. */
| possibly_empty_sequence CXFIRST { $$ = astrcat($1, "[1]"); }
| possibly_empty_sequence CXLAST { $$ = astrcat($1, "[last()]"); }
| possibly_empty_sequence CXNOT LPAREN selectors_group RPAREN { $$ = astrcat5("set-difference(", $1, ", ", $4, ")"); }
| possibly_empty_sequence CXEVEN { $$ = astrcat($1, "[position() mod 2 = 0]"); }
| possibly_empty_sequence CXODD { $$ = astrcat($1, "[position() mod 2 = 1]"); }
| possibly_empty_sequence CXEQ LPAREN NumberLike RPAREN { $$ = astrcat4($1, "[position() = ", $4, "]"); }
| possibly_empty_sequence CXGT LPAREN NumberLike RPAREN { $$ = astrcat4($1, "[position() > ", $4, "]"); }
| possibly_empty_sequence CXLT LPAREN NumberLike RPAREN { $$ = astrcat4($1, "[position() < ", $4, "]"); }
| possibly_empty_sequence CXHEADER { $$ = astrcat($1, "[contains('h1 h2 h3 h4 h5 h6', lower-case(local-name()))]"); }
| possibly_empty_sequence CXCONTAINS LPAREN StringLike RPAREN { $$ = astrcat4($1, "[contains(., ", $4, ")]"); }
| possibly_empty_sequence CXEMPTY { $$ = astrcat($1, "[not(node())]"); }
| possibly_empty_sequence CXHAS LPAREN selectors_group RPAREN { $$ = astrcat4($1, "[", $4, "]"); }
| possibly_empty_sequence CXPARENT { $$ = astrcat($1, "[node()]"); }
/* Child-position qualifiers: the position is taken among all element siblings, then narrowed with self. */
| possibly_empty_sequence CXNTHCH LPAREN NumberLike RPAREN { $$ = astrcat4("*[", $4, "]/self::", $1); }
| possibly_empty_sequence CXNTHLASTCH LPAREN NumberLike RPAREN { $$ = astrcat4("*[last() - ", $4, "]/self::", $1); }
| possibly_empty_sequence CXNTHTYPE LPAREN NumberLike RPAREN { $$ = astrcat4($1, "[position() = ", $4, "]"); }
| possibly_empty_sequence CXNTHLASTTYPE LPAREN NumberLike RPAREN { $$ = astrcat4($1, "[position() = last() - ", $4, "]"); }
| possibly_empty_sequence CXFIRSTCH { $$ = astrcat("*[1]/self::", $1); }
| possibly_empty_sequence CXLASTCH { $$ = astrcat("*[last()]/self::", $1); }
| possibly_empty_sequence CXFIRSTTYPE { $$ = astrcat($1, "[1]"); }
| possibly_empty_sequence CXLASTTYPE { $$ = astrcat($1, "[last()]"); }
/* "Only" qualifiers: the step matched exactly one node, i.e. last() is 1. */
| possibly_empty_sequence CXONLYCH { $$ = astrcat("*[last()=1]/self::", $1); }
| possibly_empty_sequence CXONLYTYPE { $$ = astrcat($1, "[last()=1]"); }
/* Form-control qualifiers: compare the element name and, where relevant, its type attribute. */
| possibly_empty_sequence CXINPUT { $$ = astrcat($1, "[lower-case(name())='input']"); }
| possibly_empty_sequence CXTEXT { $$ = astrcat($1, "[lower-case(name())='input' and lower-case(@type)='text']"); }
| possibly_empty_sequence CXPASSWORD { $$ = astrcat($1, "[lower-case(name())='input' and lower-case(@type)='password']"); }
| possibly_empty_sequence CXRADIO { $$ = astrcat($1, "[lower-case(name())='input' and lower-case(@type)='radio']"); }
| possibly_empty_sequence CXCHECKBOX { $$ = astrcat($1, "[lower-case(name())='input' and lower-case(@type)='checkbox']"); }
| possibly_empty_sequence CXSUBMIT { $$ = astrcat($1, "[lower-case(name())='input' and lower-case(@type)='submit']"); }
| possibly_empty_sequence CXIMAGE { $$ = astrcat($1, "[lower-case(name())='input' and lower-case(@type)='image']"); }
| possibly_empty_sequence CXRESET { $$ = astrcat($1, "[lower-case(name())='input' and lower-case(@type)='reset']"); }
| possibly_empty_sequence CXBUTTON { $$ = astrcat($1, "[lower-case(name())='input' and lower-case(@type)='button']"); }
| possibly_empty_sequence CXFILE { $$ = astrcat($1, "[lower-case(name())='input' and lower-case(@type)='file']"); }
| possibly_empty_sequence CXENABLED { $$ = astrcat($1, "[lower-case(name())='input' and not(@disabled)]"); }
| possibly_empty_sequence CXDISABLED { $$ = astrcat($1, "[lower-case(name())='input' and @disabled]"); }
| possibly_empty_sequence CXCHECKED { $$ = astrcat($1, "[@checked]"); }
| possibly_empty_sequence CXSELECTED { $$ = astrcat($1, "[@selected]"); }
;
/* The part in front of a qualifier; when nothing precedes it, "*" is used instead. */
possibly_empty_sequence
: simple_selector_sequence
| { $$ = "*"; }
;
/* What a compound selector can start with: an element name or the universal selector. */
simple_selector_anchor
: type_selector
| universal
;
/* Element name, optionally with a namespace prefix; the two are joined with ":". */
type_selector
: namespace_prefix element_name { $$ = astrcat3($1, ":", $2); }
| element_name
;
/* Prefix followed by PIPE. A SPLAT prefix or a missing prefix both yield "*". */
namespace_prefix
: SPLAT PIPE { $$ = "*"; }
| Ident PIPE { $$ = $1; }
| PIPE { $$ = "*"; }
;
element_name
: Ident
;
/* SPLAT, optionally with a namespace prefix joined by ":". */
universal
: namespace_prefix SPLAT { $$ = astrcat3($1, ":", $2); }
| SPLAT
;
/* Numeric argument of the positional qualifiers: a bare NUMBER token. */
NumberLike
: NUMBER
;
/*
 * Selector identifier: a NAME, a backslash literal, a keyword, or a NAME or
 * backslash literal followed by more identifier text.
 *
 * For BSLASHLIT the first character of the token text is skipped ($1 + 1).
 * NOTE(review): this assumes the lexer delivers the backslash as the first
 * character of the token text - confirm against the lexer.
 *
 * The previous actions assigned through *$$, which overwrote a single
 * character of the first component instead of setting the result, and the
 * NAME Ident case used strcat() on a token buffer whose capacity is not
 * known here. All three now build a fresh string with astrdup / astrcat.
 */
Ident
: NAME
| BSLASHLIT { $$ = astrdup($1 + 1); }
| NAME Ident { $$ = astrcat($1, $2); }
| BSLASHLIT Ident { $$ = astrcat($1 + 1, $2); }
| keyword
;
/*
 * Axis names, operator names and node-type names that the lexer returns as
 * dedicated tokens; listing them here lets Ident accept them as identifiers.
 */
keyword
: XANCESTOR
| XANCESTORSELF
| XATTR
| XCHILD
| XDESC
| XDESCSELF
| XFOLLOW
| XFOLLOWSIB
| XNS
| XPARENT
| XPRE
| XPRESIB
| XSELF
| XOR
| XAND
| XDIV
| XMOD
| XCOMMENT
| XTEXT
| XPI
| XNODE
;
/* Value of an attribute comparison: an identifier gets wrapped in single quotes, a STRING is used as is. */
StringLike
: Ident { $$ = astrcat3("'", $1, "'"); }
| STRING
;
/* Optional whitespace: a single space when present, an empty string otherwise. */
OptS
: S { $$ = " "; }
| { $$ = ""; }
;
%%
/*
 * Translate a function name through the alias table.
 * Returns the entry stored in alias_hash under key when there is one,
 * otherwise key itself.
 */
char* xpath_alias(char* key) {
  char* aliased = (char*) xmlHashLookup(alias_hash, key);
  if (aliased != NULL) {
    return aliased;
  }
  return key;
}
/*
 * Create the alias table consulted by xpath_alias() and register the
 * built-in short names for the namespaced functions.
 *
 * The function is idempotent: once alias_hash exists, further calls return
 * immediately instead of replacing (and thereby leaking) the existing table.
 * If the table cannot be created, alias_hash stays unset and nothing is
 * registered, so a later call can try again.
 */
void init_xpath_alias() {
  if (alias_hash) {
    return;
  }
  alias_hash = xmlHashCreate(100);
  if (!alias_hash) {
    return;
  }
  xmlHashAddEntry(alias_hash, "html", "lib:html-document");
  xmlHashAddEntry(alias_hash, "match", "regexp:match");
  xmlHashAddEntry(alias_hash, "replace", "regexp:replace");
  xmlHashAddEntry(alias_hash, "test", "regexp:test");
  xmlHashAddEntry(alias_hash, "with-newlines", "lib:nl");
}
/*
 * Parse one expression and return its translation.
 *
 * string: the text handed to prepare_parse() before yyparse() runs.
 * Returns the string recorded through answer() when yyparse() reports
 * success, or NULL when parsing fails.
 *
 * parsed_answer is cleared first and the yyparse() status is checked, so a
 * failed parse can no longer return the result left over from an earlier
 * successful call. Call start_debugging() beforehand to trace the parser.
 */
char* myparse(char* string){
  int status;

  parsed_answer = NULL;
  prepare_parse(string);
  status = yyparse();
  cleanup_parse();
  return status == 0 ? parsed_answer : NULL;
}
/* Store the final translated string in parsed_answer; myparse() returns it. */
void answer(char* a)
{
  parsed_answer = a;
}
/* Switch on the generated parser's trace output by setting yydebug. */
void start_debugging()
{
  yydebug = 1;
}