tentative rename, etc

This commit is contained in:
Kyle Maxwell 2009-03-03 21:31:24 -08:00
parent 70a7c52d92
commit 712f3a64e3
34 changed files with 387 additions and 425 deletions

12
INSTALL
View File

@ -1,4 +1,4 @@
Dexter depends on
Parsley depends on
- the JSON C library from http://oss.metaparadigm.com/json-c/ (I used 0.7)
- argp (standard with Linux, other platforms use argp-standalone package)
- pcre (with dev headers)
@ -32,17 +32,11 @@ sudo make install
Ruby Binding (via Gems)
------------------------------------------------------------------------
# install the C version first
cd ruby
gem build dexterous.gemspec
sudo gem install dexterous
http://github.com/fizx/parsley-ruby
Python Binding
------------------------------------------------------------------------
# install the C version first
# Use Python 2.6, as this depends on the json support in Python's stdlib
cd python
python setup.py install
http://github.com/fizx/pyparsley
Other OS/Configurations:
------------------------------------------------------------------------

8
INTRO
View File

@ -1,6 +1,6 @@
<html><textarea style="width:100%;height:100%">
Towards a universal scraping API
or, an introduction to dexter
or, an introduction to parsley
Web scraping is a chore. Scraper scripts are brittle and slow, and everyone writes their own custom implementation, resulting in countless hours of repeated work. Let's work together to make it easier. Let's do what regular expressions did for text processing, and what SQL did for databases. Let's create a universal domain-specific language for web scraping.
@ -47,8 +47,8 @@ Applying this to http://www.yelp.com/biz/amnesia-san-francisco yields:
You'll note that the output structure mirrors the input structure. In the Ruby binding, you can get both input and output natively:
> require "open-uri"
> require "dexter"
> Dexterous.new({"title" => "h1", "links" => ["a"]}).parse(:url => "http://www.yelp.com/biz/amnesia-san-francisco")
> require "parsley"
> Parsley.new({"title" => "h1", "links" => ["a"]}).parse(:url => "http://www.yelp.com/biz/amnesia-san-francisco")
#=> {"title"=>"Amnesia", "links"=>["Yelp", "Welcome", "About Me"]}
We'll also add both explicit and implicit grouping. Here's an extension of the previous example with explicit grouping:
@ -81,6 +81,4 @@ If you instead wanted to group by date, you could use implicit grouping. It's i
}]
}
In the next blog article, I'll talk about variables, crawling with dex, dex validations, sharing, and automatic inference of dex scripts from web page structures. Hopefully, you have a taste of what dex scripts can do, and you like it. There's an alpha implementation under active development at []. I'd love to have more collaborators, bug reports, unit tests, docs, encouragement, etc.
</textarea></html>

View File

@ -1,55 +1,41 @@
AM_YFLAGS = -d
BUILT_SOURCES=parser.h
lib_LTLIBRARIES = libdexter.la
libdexter_la_SOURCES = dex_mem.c xml2json.c regexp.c printbuf.c functions.c util.c kstring.c obstack.c scanner.l parser.y dexter.c
include_HEADERS = dexter.h obstack.h xml2json.h
lib_LTLIBRARIES = libparsley.la
libparsley_la_SOURCES = parsley_mem.c xml2json.c regexp.c printbuf.c functions.c util.c kstring.c obstack.c scanner.l parser.y parsley.c
include_HEADERS = parsley.h obstack.h xml2json.h
bin_PROGRAMS = dexterc dexter
bin_PROGRAMS = parsleyc parsley
dexterc_SOURCES = dexterc_main.c
dexterc_LDADD = libdexter.la
parsleyc_SOURCES = parsleyc_main.c
parsleyc_LDADD = libparsley.la
dexter_SOURCES = dexter_main.c
dexter_LDADD = libdexter.la
parsley_SOURCES = parsley_main.c
parsley_LDADD = libparsley.la
bisect:
./bootstrap.sh && ./configure && make clean && make check
port:
make clean
rm -rf /tmp/dexter-`cat VERSION`
cp -R . /tmp/dexter-`cat VERSION`
tar -C /tmp/ --exclude release --exclude .git -zcf "/tmp/dexter-`cat VERSION`.tar.gz" dexter-`cat VERSION`
rsync --progress "/tmp/dexter-`cat VERSION`.tar.gz" kylemaxwell.com:/var/www/kylemaxwell_com/dexter/
cat Portfile.in | sed "s/<VERSION>/`cat VERSION`/" > Portfile
echo "checksums \
md5 `md5 /tmp/dexter-\`cat VERSION\`.tar.gz | sed "s/.*= //"` \
sha1 `openssl sha1 /tmp/dexter-\`cat VERSION\`.tar.gz | sed "s/.*= //"` \
rmd160 `openssl rmd160 /tmp/dexter-\`cat VERSION\`.tar.gz | sed "s/.*= //"`" \
>> Portfile
sudo port build
install-all:
./bootstrap.sh && ./configure && make && make install && cd ruby && rake install && cd ../python && python setup.py install
check-am:
@echo "fictional..."; ./dexter test/fictional.dex test/fictional.html | diff test/fictional.json - && echo " success."
@echo "fictional-opt..."; ./dexter test/fictional-opt.dex test/fictional-opt.html | diff test/fictional-opt.json - && echo " success."
@echo "function-magic..."; ./dexter test/function-magic.dex test/function-magic.html | diff test/function-magic.json - && echo " success."
@echo "malformed-expr..."; ./dexter test/malformed-expr.dex test/malformed-expr.html | diff test/malformed-expr.json - && echo " success."
@echo "malformed-json..."; ./dexter test/malformed-json.dex test/malformed-json.html | diff test/malformed-json.json - && echo " success."
@echo "css_attr..."; ./dexter -x test/css_attr.dex test/css_attr.html | diff test/css_attr.json - && echo " success."
@echo "match..."; ./dexter -x test/match.dex test/match.xml | diff test/match.json - && echo " success."
@echo "position..."; ./dexter test/position.dex test/position.html | diff test/position.json - && echo " success."
@echo "replace..."; ./dexter -x test/replace.dex test/replace.xml | diff test/replace.json - && echo " success."
@echo "scope..."; ./dexter test/scope.dex test/scope.html | diff test/scope.json - && echo " success."
@echo "test..."; ./dexter -x test/test.dex test/test.xml | diff test/test.json - && echo " success."
@echo "yelp..."; ./dexter test/yelp.dex test/yelp.html | diff test/yelp.json - && echo " success."
@echo "optional..."; ./dexter test/optional.dex test/optional.html | diff test/optional.json - && echo " success."
@echo "malformed-function..."; ./dexter test/malformed-function.dex test/malformed-function.html | diff test/malformed-function.json - && echo " success."
@echo "empty..."; ./dexter test/empty.dex test/empty.html | diff test/empty.json - && echo " success."
@echo "trivial..."; ./dexter test/trivial.dex test/trivial.html | diff test/trivial.json - && echo " success."
@echo "trivial2..."; ./dexter test/trivial2.dex test/trivial2.html | diff test/trivial2.json - && echo " success."
@echo "craigs-simple..."; ./dexter test/craigs-simple.dex test/craigs-simple.html | diff test/craigs-simple.json - && echo " success."
@echo "yelp-home..."; ./dexter test/yelp-home.dex test/yelp-home.html | diff test/yelp-home.json - && echo " success."
@echo "fictional..."; ./parsley test/fictional.let test/fictional.html | diff test/fictional.json - && echo " success."
@echo "fictional-opt..."; ./parsley test/fictional-opt.let test/fictional-opt.html | diff test/fictional-opt.json - && echo " success."
@echo "function-magic..."; ./parsley test/function-magic.let test/function-magic.html | diff test/function-magic.json - && echo " success."
@echo "malformed-expr..."; ./parsley test/malformed-expr.let test/malformed-expr.html | diff test/malformed-expr.json - && echo " success."
@echo "malformed-json..."; ./parsley test/malformed-json.let test/malformed-json.html | diff test/malformed-json.json - && echo " success."
@echo "css_attr..."; ./parsley -x test/css_attr.let test/css_attr.html | diff test/css_attr.json - && echo " success."
@echo "match..."; ./parsley -x test/match.let test/match.xml | diff test/match.json - && echo " success."
@echo "position..."; ./parsley test/position.let test/position.html | diff test/position.json - && echo " success."
@echo "replace..."; ./parsley -x test/replace.let test/replace.xml | diff test/replace.json - && echo " success."
@echo "scope..."; ./parsley test/scope.let test/scope.html | diff test/scope.json - && echo " success."
@echo "test..."; ./parsley -x test/test.let test/test.xml | diff test/test.json - && echo " success."
@echo "yelp..."; ./parsley test/yelp.let test/yelp.html | diff test/yelp.json - && echo " success."
@echo "optional..."; ./parsley test/optional.let test/optional.html | diff test/optional.json - && echo " success."
@echo "malformed-function..."; ./parsley test/malformed-function.let test/malformed-function.html | diff test/malformed-function.json - && echo " success."
@echo "empty..."; ./parsley test/empty.let test/empty.html | diff test/empty.json - && echo " success."
@echo "trivial..."; ./parsley test/trivial.let test/trivial.html | diff test/trivial.json - && echo " success."
@echo "trivial2..."; ./parsley test/trivial2.let test/trivial2.html | diff test/trivial2.json - && echo " success."
@echo "craigs-simple..."; ./parsley test/craigs-simple.let test/craigs-simple.html | diff test/craigs-simple.json - && echo " success."
@echo "yelp-home..."; ./parsley test/yelp-home.let test/yelp-home.html | diff test/yelp-home.json - && echo " success."

View File

@ -60,7 +60,7 @@ am__installdirs = "$(DESTDIR)$(libdir)" "$(DESTDIR)$(bindir)" \
libLTLIBRARIES_INSTALL = $(INSTALL)
LTLIBRARIES = $(lib_LTLIBRARIES)
libdexter_la_LIBADD =
am_libdexter_la_OBJECTS = dex_mem.lo xml2json.lo regexp.lo printbuf.lo \
am_libdexter_la_OBJECTS = parsley_mem.lo xml2json.lo regexp.lo printbuf.lo \
functions.lo util.lo kstring.lo obstack.lo scanner.lo \
parser.lo dexter.lo
libdexter_la_OBJECTS = $(am_libdexter_la_OBJECTS)
@ -229,7 +229,7 @@ top_srcdir = @top_srcdir@
AM_YFLAGS = -d
BUILT_SOURCES = parser.h
lib_LTLIBRARIES = libdexter.la
libdexter_la_SOURCES = dex_mem.c xml2json.c regexp.c printbuf.c functions.c util.c kstring.c obstack.c scanner.l parser.y dexter.c
libdexter_la_SOURCES = parsley_mem.c xml2json.c regexp.c printbuf.c functions.c util.c kstring.c obstack.c scanner.l parser.y dexter.c
include_HEADERS = dexter.h obstack.h xml2json.h
dexterc_SOURCES = dexterc_main.c
dexterc_LDADD = libdexter.la
@ -348,7 +348,7 @@ mostlyclean-compile:
distclean-compile:
-rm -f *.tab.c
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dex_mem.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/parsley_mem.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dexter.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dexter_main.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dexterc_main.Po@am__quote@

15
OUTLINE
View File

@ -1,15 +0,0 @@
- what is dex?
- data extraction from xml/html
- current options: hpricot/nokogiri, XSLT, beautiful soup
- selectors + structure
- selectors: xpath + css + functions (xpath+exsl+regex)
h1>a
substring-after(h1, ':')
regexp:match(span.rating, '\d+', '')
//location[obj='some-id']/ancestor::group/@id
html('http://google.com')//title
html(//div/a/@href)//title
- structure: json-by-example
- example: yelp.dex

2
PAPER
View File

@ -1,6 +1,6 @@
Abstract
================================================================
A common programming task is data extraction from xml and html documents. I introduce dex, an embedded language (ala SQL, regular expressions) that improves the usability and/or speed of current extraction techniques.
A common programming task is data extraction from xml and html documents. I introduce parsley, an embedded language (ala SQL, regular expressions) that improves the usability and/or speed of current extraction techniques.
Introduction
================================================================

View File

@ -1,15 +1,15 @@
# -*- coding: utf-8; mode: tcl; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- vim:fenc=utf-8:ft=tcl:et:sw=4:ts=4:sts=4
# $Id$
PortSystem 1.0
name dexter
name parsley
version 0.1.5
categories net
maintainers kyle@kylemaxwell.com
description Data extractor
long_description Dexter is a system to extract data from HTML/XML documents
homepage http://github.com/fizx/dexter
long_description Parsley is a system to extract data from HTML/XML documents
homepage http://github.com/fizx/parsley
platforms darwin
master_sites http://kylemaxwell.com/dexter/
master_sites http://parslets.com
depends_lib port:argp-standalone \
port:json-c \
port:libxslt \

View File

@ -1,15 +1,15 @@
# -*- coding: utf-8; mode: tcl; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- vim:fenc=utf-8:ft=tcl:et:sw=4:ts=4:sts=4
# $Id$
PortSystem 1.0
name dexter
name parsley
version <VERSION>
categories net
maintainers kyle@kylemaxwell.com
description Data extractor
long_description Dexter is a system to extract data from HTML/XML documents
homepage http://github.com/fizx/dexter
long_description Parsley is a system to extract data from HTML/XML documents
homepage http://github.com/fizx/parsley
platforms darwin
master_sites http://kylemaxwell.com/dexter/
master_sites http://parslets.com
depends_lib port:argp-standalone \
port:json-c \
port:libxslt \

View File

@ -1,45 +1,45 @@
To use dexter from C, the following functions are available from dexter.h. In
To use parsley from C, the following functions are available from parsley.h. In
addition, there is a function to convert xml documents of the type returned by
dexter into json.
parsley into json.
You will also need passing familiarity with libxml2 and json-c to print, manipulate, and free some of the generated objects.
- http://svn.metaparadigm.com/svn/json-c/trunk
- http://xmlsoft.org/
From dexter.h
From parsley.h
=============
parsedDexPtr -- a struct that contains the following elements:
- xmlDocPtr xml -- the output of a dex document parse, as a libxml2 document
parsedParsleyPtr -- a struct that contains the following elements:
- xmlDocPtr xml -- the output of a parslet document parse, as a libxml2 document
- char *error -- an error message, or NULL if no error
- compiled_dex *dex -- reference to the dex that did the parsing
- compiled_parsley *parsley -- reference to the parsley that did the parsing
dexPtr dex_compile(char* dex, char* incl)
parsleyPtr parsley_compile(char* parsley, char* incl)
Arguments:
- char* dex -- a string of dex to compile.
- char* parsley -- a string of parsley to compile.
- char* incl -- arbitrary XSLT to inject directly into the stylesheet,
outside any templates.
Returns: A structure that you can pass to dex_parse_* to do the actual
Returns: A structure that you can pass to parsley_parse_* to do the actual
parsing. This structure contains the compiled XSLT.
Notes: This is *NOT* thread-safe. (Usage of the dex via dex_parse_* *IS*
Notes: This is *NOT* thread-safe. (Usage of the parslet via parsley_parse_* *IS*
thread-safe, however.)
void dex_free(dexPtr);
void parsley_free(parsleyPtr);
Frees the dexPtr's memory.
Frees the parsleyPtr's memory.
void parsed_dex_free(parsedDexPtr);
void parsed_parsley_free(parsedParsleyPtr);
Frees the parsedDexPtr's memory.
Frees the parsedParsleyPtr's memory.
parsedDexPtr dex_parse_file(dexPtr dex, char* file_name, boolean html);
parsedParsleyPtr parsley_parse_file(parsleyPtr parsley, char* file_name, boolean html);
Arguments:
- dexPtr dex -- Compiled dex struct
- parsleyPtr parsley -- Compiled parsley struct
- char* file_name -- file to parse
- boolean html -- Use the html parser? (instead of xml)
@ -48,14 +48,14 @@ parsedDexPtr dex_parse_file(dexPtr dex, char* file_name, boolean html);
like xmlSaveFormatFile(). If you want json output, look below for xml2json
docs.
parsedDexPtr dex_parse_string(dexPtr dex, char* string, size_t len, boolean html);
parsedParsleyPtr parsley_parse_string(parsleyPtr parsley, char* string, size_t len, boolean html);
Parses the in-memory string/length combination given. See dex_parse_file
Parses the in-memory string/length combination given. See parsley_parse_file
docs.
parsedDexPtr dex_parse_doc(dexPtr dex, xmlDocPtr doc);
parsedParsleyPtr parsley_parse_doc(parsleyPtr parsley, xmlDocPtr doc);
Uses the dex parser to parse a libxml2 document.
Uses the parsley parser to parse a libxml2 document.
From xml2json.h
===============
@ -63,13 +63,13 @@ From xml2json.h
struct json_object * xml2json(xmlNodePtr);
Converts an xml subtree to json. The xml should be in the format returned
by dexter. Basically, xml attributes get ignored, and if you want an array
by parsley. Basically, xml attributes get ignored, and if you want an array
like [a,b], use:
<dex:groups>
<dex:group>a</dex:group>
<dex:group>b</dex:group>
</dex:groups>
<parsley:groups>
<parsley:group>a</parsley:group>
<parsley:group>b</parsley:group>
</parsley:groups>
To get a null-terminated string out, use:

View File

@ -1,14 +1,14 @@
<h3>Overview<a name="start-readme">&nbsp;</a></h3>
Dexter is a simple language for data-extraction from XML-like documents (including HTML). Dexter is:
Parsley is a simple language for data-extraction from XML-like documents (including HTML). Parsley is:
1. Blazing fast -- Typical HTML parses are sub-50ms.
2. Easy to write and understand -- Dexter uses your current knowledge of JSON, CSS, and XPath.
3. Powerful. Dexter can understand full XPath, including standard and user-defined functions.
2. Easy to write and understand -- Parsley uses your current knowledge of JSON, CSS, and XPath.
3. Powerful. Parsley can understand full XPath, including standard and user-defined functions.
### Examples
A simple script, or "dex", looks like this:
A simple script, or "parslet", looks like this:
{
"title": "h1",
@ -20,7 +20,7 @@ A simple script, or "dex", looks like this:
]
}
This returns JSON or XML output with the same structure. Applying this dex to http://www.yelp.com/biz/amnesia-san-francisco yields either:
This returns JSON or XML output with the same structure. Applying this parslet to http://www.yelp.com/biz/amnesia-san-francisco yields either:
{
"title": "Amnesia",
@ -43,26 +43,26 @@ This returns JSON or XML output with the same structure. Applying this dex to h
or equivalently:
<dexter:root>
<parsley:root>
<title>Amnesia</title>
<links>
<dexter:group>
<parsley:group>
<href>/</href>
<text>Yelp</text>
</dexter:group>
<dexter:group>
</parsley:group>
<parsley:group>
<href>/</href>
<text>Welcome</text>
</dexter:group>
<dexter:group>
</parsley:group>
<parsley:group>
<href>/signup?return_url=%2Fuser_details</href>
<text> About Me</text>
</dexter:group>
</parsley:group>
.....
</links>
</dexter:root>
</parsley:root>
This dex could also have been expressed as:
This parslet could also have been expressed as:
{
"title": "h1",
@ -74,7 +74,7 @@ This dex could also have been expressed as:
]
}
The "a" in links(a) is a "key selector" -- an explicit grouping (with scope) for the array. You can use any XPath 1.0 or CSS3 expression as a value or a key selector. Dexter will try to be smart, and figure out which you are using. You can use CSS selectors inside XPath functions -- "substring-after(h1>a, ':')" is a valid expression.
The "a" in links(a) is a "key selector" -- an explicit grouping (with scope) for the array. You can use any XPath 1.0 or CSS3 expression as a value or a key selector. Parsley will try to be smart, and figure out which you are using. You can use CSS selectors inside XPath functions -- "substring-after(h1>a, ':')" is a valid expression.
### Variables

4
TODO
View File

@ -4,8 +4,8 @@
# - define stable c api
# - p/br support explicit
# - p/br support needs div?!
# - dex_parse_url support
# - ruby binding for dex_parse_url
# - parsley_parse_url support
# - ruby binding for parsley_parse_url
# - relative urls
# - p/br support needs multicase handling
# - reorganize project (at least tests, makefile.am src?!)

View File

@ -17,8 +17,8 @@
#include "functions.h"
void dex_register_all(){
xsltRegisterExtModuleFunction ((const xmlChar *) "html-document", "http://kylemaxwell.com/dexter/library",
void parsley_register_all(){
xsltRegisterExtModuleFunction ((const xmlChar *) "html-document", "http://parslets.com/stdlib",
xsltHtmlDocumentFunction);
}

View File

@ -1,5 +1,5 @@
#ifndef DEX_FUNCTIONS_H_INCLUDED
#define DEX_FUNCTIONS_H_INCLUDED
#ifndef PARSLEY_FUNCTIONS_H_INCLUDED
#define PARSLEY_FUNCTIONS_H_INCLUDED
#include <libxml/xpath.h>
#include <libxml/HTMLparser.h>
@ -8,7 +8,7 @@
#include <libxslt/transform.h>
#include <libxslt/documents.h>
void dex_register_all();
void parsley_register_all();
static void xsltHtmlDocumentFunction(xmlXPathParserContextPtr, int);
static void xsltHtmlDocumentFunctionLoadDocument(xmlXPathParserContextPtr, xmlChar*);

View File

@ -4,8 +4,8 @@
#include <stdarg.h>
#include "kstring.h"
#include "printbuf.h"
#include "dex_mem.h"
#include "dexter.h"
#include "parsley_mem.h"
#include "parsley.h"
char* arepl(char* orig, char* old, char* new) {
// printf("y\n");
@ -32,55 +32,55 @@ char* astrdup(char* c) {
}
char* astrcat(char* a, char* b) {
char* output = (char*) dex_alloc(sizeof(char) * (strlen(a) + strlen(b) + 1));
char* output = (char*) parsley_alloc(sizeof(char) * (strlen(a) + strlen(b) + 1));
sprintf(output, "%s%s", a, b);
return output;
}
char* astrcat3(char* a, char* b, char* c) {
char* output = (char*) dex_alloc(sizeof(char) * (strlen(a) + strlen(b) + strlen(c) + 1));
char* output = (char*) parsley_alloc(sizeof(char) * (strlen(a) + strlen(b) + strlen(c) + 1));
sprintf(output, "%s%s%s", a, b, c);
return output;
}
char* astrcat4(char* a, char* b, char* c, char* d) {
char* output = (char*) dex_alloc(sizeof(char) * (strlen(a) + strlen(b) + strlen(c) + strlen(d) + 1));
char* output = (char*) parsley_alloc(sizeof(char) * (strlen(a) + strlen(b) + strlen(c) + strlen(d) + 1));
sprintf(output, "%s%s%s%s", a, b, c, d);
return output;
}
char* astrcat5(char* a, char* b, char* c, char* d, char* e) {
char* output = (char*) dex_alloc(sizeof(char) * (strlen(a) + strlen(b) + strlen(c) + strlen(d) + strlen(e) + 1));
char* output = (char*) parsley_alloc(sizeof(char) * (strlen(a) + strlen(b) + strlen(c) + strlen(d) + strlen(e) + 1));
sprintf(output, "%s%s%s%s%s", a, b, c, d, e);
return output;
}
char* astrcat6(char* a, char* b, char* c, char* d, char* e, char* f) {
char* output = (char*) dex_alloc(sizeof(char) * (strlen(a) + strlen(b) + strlen(c) + strlen(d) + strlen(e) + strlen(f) + 1));
char* output = (char*) parsley_alloc(sizeof(char) * (strlen(a) + strlen(b) + strlen(c) + strlen(d) + strlen(e) + strlen(f) + 1));
sprintf(output, "%s%s%s%s%s%s", a, b, c, d, e, f);
return output;
}
char* astrcat7(char* a, char* b, char* c, char* d, char* e, char* f, char* g) {
char* output = (char*) dex_alloc(sizeof(char) * (strlen(a) + strlen(b) + strlen(c) + strlen(d) + strlen(e) + strlen(f) + strlen(g) + 1));
char* output = (char*) parsley_alloc(sizeof(char) * (strlen(a) + strlen(b) + strlen(c) + strlen(d) + strlen(e) + strlen(f) + strlen(g) + 1));
sprintf(output, "%s%s%s%s%s%s%s", a, b, c, d, e, f, g);
return output;
}
char* astrcat8(char* a, char* b, char* c, char* d, char* e, char* f, char* g, char* h) {
char* output = (char*) dex_alloc(sizeof(char) * (strlen(a) + strlen(b) + strlen(c) + strlen(d) + strlen(e) + strlen(f) + strlen(g) + strlen(h) + 1));
char* output = (char*) parsley_alloc(sizeof(char) * (strlen(a) + strlen(b) + strlen(c) + strlen(d) + strlen(e) + strlen(f) + strlen(g) + strlen(h) + 1));
sprintf(output, "%s%s%s%s%s%s%s%s", a, b, c, d, e, f, g, h);
return output;
}
char* astrcat9(char* a, char* b, char* c, char* d, char* e, char* f, char* g, char* h, char* i) {
char* output = (char*) dex_alloc(sizeof(char) * (strlen(a) + strlen(b) + strlen(c) + strlen(d) + strlen(e) + strlen(f) + strlen(g) + strlen(h) + strlen(i) + 1));
char* output = (char*) parsley_alloc(sizeof(char) * (strlen(a) + strlen(b) + strlen(c) + strlen(d) + strlen(e) + strlen(f) + strlen(g) + strlen(h) + strlen(i) + 1));
sprintf(output, "%s%s%s%s%s%s%s%s%s", a, b, c, d, e, f, g, h, i);
return output;
}
char* astrcat10(char* a, char* b, char* c, char* d, char* e, char* f, char* g, char* h, char* i, char* j) {
char* output = (char*) dex_alloc(sizeof(char) * (strlen(a) + strlen(b) + strlen(c) + strlen(d) + strlen(e) + strlen(f) + strlen(g) + strlen(h) + strlen(i) + strlen(j) + 1));
char* output = (char*) parsley_alloc(sizeof(char) * (strlen(a) + strlen(b) + strlen(c) + strlen(d) + strlen(e) + strlen(f) + strlen(g) + strlen(h) + strlen(i) + strlen(j) + 1));
sprintf(output, "%s%s%s%s%s%s%s%s%s%s", a, b, c, d, e, f, g, h, i, j);
return output;
}

View File

@ -564,11 +564,11 @@ char* xpath_alias(char* key) {
void init_xpath_alias() {
alias_hash = xmlHashCreate(100);
xmlHashAddEntry(alias_hash, "html", "dex:html-document");
xmlHashAddEntry(alias_hash, "html", "lib:html-document");
xmlHashAddEntry(alias_hash, "match", "regexp:match");
xmlHashAddEntry(alias_hash, "replace", "regexp:replace");
xmlHashAddEntry(alias_hash, "test", "regexp:test");
xmlHashAddEntry(alias_hash, "with-newlines", "dex:nl");
xmlHashAddEntry(alias_hash, "with-newlines", "lib:nl");
}

136
parsley.c
View File

@ -4,7 +4,7 @@
#include <stdarg.h>
#include <json/json.h>
#include "kstring.h"
#include "dexter.h"
#include "parsley.h"
#include "y.tab.h"
#include "printbuf.h"
#include "functions.h"
@ -26,14 +26,14 @@ int yywrap(void){
return 1;
}
void parsed_dex_free(parsedDexPtr ptr) {
void parsed_parsley_free(parsedParsleyPtr ptr) {
if(ptr->xml != NULL) xmlFree(ptr->xml);
if(ptr->error != NULL) free(ptr->error);
free(ptr);
}
static parsedDexPtr parse_error(char* format, ...) {
parsedDexPtr ptr = (parsedDexPtr) calloc(sizeof(parsed_dex), 1);
static parsedParsleyPtr parse_error(char* format, ...) {
parsedParsleyPtr ptr = (parsedParsleyPtr) calloc(sizeof(parsed_parsley), 1);
ptr->xml = NULL;
va_list args;
va_start(args, format);
@ -42,33 +42,33 @@ static parsedDexPtr parse_error(char* format, ...) {
return ptr;
}
parsedDexPtr dex_parse_file(dexPtr dex, char* file, bool html) {
parsedParsleyPtr parsley_parse_file(parsleyPtr parsley, char* file, bool html) {
if(html) {
htmlParserCtxtPtr htmlCtxt = htmlNewParserCtxt();
htmlDocPtr html = htmlCtxtReadFile(htmlCtxt, file, NULL, HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |HTML_PARSE_NOWARNING);
htmlFreeParserCtxt(htmlCtxt);
if(html == NULL) return parse_error("Couldn't parse file: %s\n", file);
return dex_parse_doc(dex, html);
return parsley_parse_doc(parsley, html);
} else {
xmlParserCtxtPtr ctxt = xmlNewParserCtxt();
xmlDocPtr xml = xmlCtxtReadFile(ctxt, file, NULL, HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |HTML_PARSE_NOWARNING);
xmlFreeParserCtxt(ctxt);
if(xml == NULL) return parse_error("Couldn't parse file: %s\n", file);
return dex_parse_doc(dex, xml);
return parsley_parse_doc(parsley, xml);
}
}
parsedDexPtr dex_parse_string(dexPtr dex, char* string, size_t size, bool html) {
parsedParsleyPtr parsley_parse_string(parsleyPtr parsley, char* string, size_t size, bool html) {
if(html) {
htmlParserCtxtPtr htmlCtxt = htmlNewParserCtxt();
htmlDocPtr html = htmlCtxtReadMemory(htmlCtxt, string, size, "http://kylemaxwell.com/dexter/memory", NULL, HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |HTML_PARSE_NOWARNING);
htmlDocPtr html = htmlCtxtReadMemory(htmlCtxt, string, size, "http://parslets.com/in-memory-string", NULL, HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |HTML_PARSE_NOWARNING);
if(html == NULL) return parse_error("Couldn't parse string");
return dex_parse_doc(dex, html);
return parsley_parse_doc(parsley, html);
} else {
xmlParserCtxtPtr ctxt = xmlNewParserCtxt();
xmlDocPtr xml = xmlCtxtReadMemory(ctxt, string, size, "http://kylemaxwell.com/dexter/memory", NULL, HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |HTML_PARSE_NOWARNING);
xmlDocPtr xml = xmlCtxtReadMemory(ctxt, string, size, "http://parslets.com/in-memory-string", NULL, HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |HTML_PARSE_NOWARNING);
if(xml == NULL) return parse_error("Couldn't parse string");
return dex_parse_doc(dex, xml);
return parsley_parse_doc(parsley, xml);
}
}
@ -101,11 +101,11 @@ unlink(xmlNodePtr xml) {
static bool
is_root(xmlElementPtr xml) {
return xml != NULL && xml->name != NULL && xml->prefix !=NULL && !strcmp(xml->name, "root") && !strcmp(xml->prefix, "dexter");
return xml != NULL && xml->name != NULL && xml->prefix !=NULL && !strcmp(xml->name, "root") && !strcmp(xml->prefix, "parsley");
}
static void
prune(parsedDexPtr ptr, xmlNodePtr xml, char* err) {
prune(parsedParsleyPtr ptr, xmlNodePtr xml, char* err) {
if(xml == NULL) return;
bool optional = ((xmlElementPtr )xml)->attributes != NULL;
if(optional) {
@ -123,7 +123,7 @@ prune(parsedDexPtr ptr, xmlNodePtr xml, char* err) {
}
static void
visit(parsedDexPtr ptr, xmlNodePtr xml, bool bubbling) {
visit(parsedParsleyPtr ptr, xmlNodePtr xml, bool bubbling) {
if(xml->type != XML_ELEMENT_NODE) return;
xmlNodePtr child = xml->children;
xmlNodePtr parent = xml->parent;
@ -146,10 +146,10 @@ xml_empty(xmlNodePtr xml) {
return true;
}
parsedDexPtr dex_parse_doc(dexPtr dex, xmlDocPtr doc) {
parsedDexPtr ptr = (parsedDexPtr) calloc(sizeof(parsed_dex), 1);
ptr->dex = dex;
ptr->xml = xsltApplyStylesheet(dex->stylesheet, doc, NULL);
parsedParsleyPtr parsley_parse_doc(parsleyPtr parsley, xmlDocPtr doc) {
parsedParsleyPtr ptr = (parsedParsleyPtr) calloc(sizeof(parsed_parsley), 1);
ptr->parsley = parsley;
ptr->xml = xsltApplyStylesheet(parsley->stylesheet, doc, NULL);
if(ptr->xml != NULL && ptr->error == NULL) visit(ptr, ptr->xml->children, false);
if(ptr->xml == NULL && ptr->error == NULL) { // == NULL
ptr->error = strdup("Internal runtime error");
@ -157,57 +157,57 @@ parsedDexPtr dex_parse_doc(dexPtr dex, xmlDocPtr doc) {
return ptr;
}
dexPtr dex_compile(char* dex_str, char* incl) {
dexPtr dex = (dexPtr) calloc(sizeof(compiled_dex), 1);
parsleyPtr parsley_compile(char* parsley_str, char* incl) {
parsleyPtr parsley = (parsleyPtr) calloc(sizeof(compiled_parsley), 1);
if(last_dex_error != NULL) {
free(last_dex_error);
last_dex_error = NULL;
if(last_parsley_error != NULL) {
free(last_parsley_error);
last_parsley_error = NULL;
}
registerEXSLT();
struct json_object *json = json_tokener_parse(dex_str);
struct json_object *json = json_tokener_parse(parsley_str);
if(is_error(json)) {
dex->error = strdup("Your dex is not valid json.");
parsley->error = strdup("Your parslet is not valid json.");
// json_object_put(json); // frees json
return dex;
return parsley;
}
struct printbuf* buf = printbuf_new();
sprintbuf_dex_header(buf);
sprintbuf_parsley_header(buf);
sprintbuf(buf, "%s\n", incl);
sprintbuf(buf, "<xsl:template match=\"/\">\n");
sprintbuf(buf, "<dexter:root>\n");
sprintbuf(buf, "<parsley:root>\n");
contextPtr context = new_context(json, buf);
__dex_recurse(context);
__parsley_recurse(context);
json_object_put(json); // frees json
dex->error = last_dex_error;
parsley->error = last_parsley_error;
sprintbuf(buf, "</dexter:root>\n");
sprintbuf(buf, "</parsley:root>\n");
sprintbuf(buf, "</xsl:template>\n");
sprintbuf(buf, context->key_buf->buf);
sprintbuf(buf, "</xsl:stylesheet>\n");
if(dex->error == NULL) {
if(parsley->error == NULL) {
xmlParserCtxtPtr ctxt = xmlNewParserCtxt();
xmlDocPtr doc = xmlCtxtReadMemory(ctxt, buf->buf, buf->size, "http://kylemaxwell.com/dexter/compiled", NULL, 3);
xmlDocPtr doc = xmlCtxtReadMemory(ctxt, buf->buf, buf->size, "http://kylemaxwell.com/parsley/compiled", NULL, 3);
xmlFreeParserCtxt(ctxt);
dex->raw_stylesheet = strdup(buf->buf);
dex->stylesheet = xsltParseStylesheetDoc(doc);
parsley->raw_stylesheet = strdup(buf->buf);
parsley->stylesheet = xsltParseStylesheetDoc(doc);
}
printbuf_free(buf);
dex_collect();
parsley_collect();
return dex;
return parsley;
}
static contextPtr new_context(struct json_object * json, struct printbuf *buf) {
contextPtr c = dex_alloc(sizeof(dex_context));
contextPtr c = parsley_alloc(sizeof(parsley_context));
c->key_buf = printbuf_new();
sprintbuf(c->key_buf, "");
c->name = "root";
@ -228,17 +228,17 @@ static contextPtr new_context(struct json_object * json, struct printbuf *buf) {
}
contextPtr deeper_context(contextPtr context, char* key, struct json_object * val) {
contextPtr c = dex_alloc(sizeof(dex_context));
contextPtr c = parsley_alloc(sizeof(parsley_context));
c->key_buf = context->key_buf;
c->keys = context->keys;
c->tag = dex_key_tag(key);
c->flags = dex_key_flags(key);
c->tag = parsley_key_tag(key);
c->flags = parsley_key_flags(key);
c->name = astrcat3(context->name, ".", c->tag);
dex_parsing_context = c;
parsley_parsing_context = c;
c->array = val != NULL && json_object_is_type(val, json_type_array);
c->json = c->array ? json_object_array_get_idx(val, 0) : val;
c->string = val != NULL && json_object_is_type(c->json, json_type_string);
c->filter = dex_key_filter(key);
c->filter = parsley_key_filter(key);
c->magic = ((c->filter == NULL) && c->array && !(c->string)) ? c->name : context->magic;
if(context->filter != NULL && !c->array) c->magic = NULL;
c->buf = context->buf;
@ -259,7 +259,7 @@ static char* filter_intersection(char* key, char* expr) {
}
}
void dex_free(dexPtr ptr) {
void parsley_free(parsleyPtr ptr) {
if(ptr->error != NULL)
free(ptr->error);
if(ptr->raw_stylesheet != NULL)
@ -271,14 +271,14 @@ void dex_free(dexPtr ptr) {
void yyerror(const char * s) {
struct printbuf *buf = printbuf_new();
if(last_dex_error !=NULL) sprintbuf(buf, "%s\n", last_dex_error);
sprintbuf(buf, "%s in key: %s", s, dex_parsing_context->name);
last_dex_error = strdup(buf->buf);
if(last_parsley_error !=NULL) sprintbuf(buf, "%s\n", last_parsley_error);
sprintbuf(buf, "%s in key: %s", s, parsley_parsing_context->name);
last_parsley_error = strdup(buf->buf);
printbuf_free(buf);
}
static char* optional(contextPtr c) {
return (c->flags & DEX_OPTIONAL) ? " optional=\"true\"" : "";
return (c->flags & PARSLEY_OPTIONAL) ? " optional=\"true\"" : "";
}
static bool
@ -289,7 +289,7 @@ all_strings(struct json_object * json) {
return true;
}
void __dex_recurse(contextPtr context) {
void __parsley_recurse(contextPtr context) {
// printf("a\n");
char* tmp;
struct printbuf * buf;
@ -303,14 +303,14 @@ void __dex_recurse(contextPtr context) {
if(c->array || context->zipped) {
if(c->filter){
// printf("b\n");
sprintbuf(c->buf, "<dexter:groups optional=\"true\"><xsl:for-each select=\"%s\"><dexter:group optional=\"true\">\n", c->filter);
sprintbuf(c->buf, "<parsley:groups optional=\"true\"><xsl:for-each select=\"%s\"><parsley:group optional=\"true\">\n", c->filter);
sprintbuf(c->buf, "<xsl:value-of select=\"%s\" />\n", c->raw_expr);
sprintbuf(c->buf, "</dexter:group></xsl:for-each></dexter:groups>\n");
sprintbuf(c->buf, "</parsley:group></xsl:for-each></parsley:groups>\n");
} else {
// printf("c\n");
sprintbuf(c->buf, "<dexter:groups optional=\"true\"><xsl:for-each select=\"%s\"><dexter:group optional=\"true\">\n", c->expr);
sprintbuf(c->buf, "<parsley:groups optional=\"true\"><xsl:for-each select=\"%s\"><parsley:group optional=\"true\">\n", c->expr);
sprintbuf(c->buf, "<xsl:value-of select=\".\" />\n");
sprintbuf(c->buf, "</dexter:group></xsl:for-each></dexter:groups>\n");
sprintbuf(c->buf, "</parsley:group></xsl:for-each></parsley:groups>\n");
}
} else {
if(c->filter){
@ -327,28 +327,28 @@ void __dex_recurse(contextPtr context) {
if(c->array) { // scoped
if(c->filter != NULL) {
// printf("f\n");
sprintbuf(c->buf, "<dexter:groups optional=\"true\"><xsl:for-each select=\"%s\"><dexter:group optional=\"true\">\n", c->filter);
__dex_recurse(c);
sprintbuf(c->buf, "</dexter:group></xsl:for-each></dexter:groups>\n");
sprintbuf(c->buf, "<parsley:groups optional=\"true\"><xsl:for-each select=\"%s\"><parsley:group optional=\"true\">\n", c->filter);
__parsley_recurse(c);
sprintbuf(c->buf, "</parsley:group></xsl:for-each></parsley:groups>\n");
} else { // magic
if(all_strings(c->json)) {
c->magic = NULL;
c->zipped = 1;
sprintbuf(c->buf, "<dexter:zipped>\n");
__dex_recurse(c);
sprintbuf(c->buf, "</dexter:zipped>\n");
sprintbuf(c->buf, "<parsley:zipped>\n");
__parsley_recurse(c);
sprintbuf(c->buf, "</parsley:zipped>\n");
} else {
// printf("h\n");
sprintbuf(c->buf, "<xsl:variable name=\"%s__context\" select=\".\"/>\n", c->name);
dex_parsing_context = c;
parsley_parsing_context = c;
char * str = inner_key_of(c->json);
if(str != NULL) {
// printf("i\n");
tmp = myparse(astrdup(str));
sprintbuf(c->buf, "<dexter:groups optional=\"true\"><xsl:for-each select=\"%s\">\n", filter_intersection(context->magic, tmp));
sprintbuf(c->buf, "<parsley:groups optional=\"true\"><xsl:for-each select=\"%s\">\n", filter_intersection(context->magic, tmp));
// keys
keys = dex_alloc(sizeof(key_node));
keys = parsley_alloc(sizeof(key_node));
keys->name = c->name;
keys->use = full_expr(c, tmp);
keys->next = c->keys;
@ -371,20 +371,20 @@ void __dex_recurse(contextPtr context) {
);
sprintbuf(c->buf, "<xsl:variable name=\"%s__index\" select=\"%s\"/>\n", c->name, tmp);
sprintbuf(c->buf, "<xsl:for-each select=\"$%s__context\"><dexter:group optional=\"true\">\n", c->name);
__dex_recurse(c);
sprintbuf(c->buf, "</dexter:group></xsl:for-each></xsl:for-each></dexter:groups>\n");
sprintbuf(c->buf, "<xsl:for-each select=\"$%s__context\"><parsley:group optional=\"true\">\n", c->name);
__parsley_recurse(c);
sprintbuf(c->buf, "</parsley:group></xsl:for-each></xsl:for-each></parsley:groups>\n");
}
}
}
} else {
// printf("j\n");
if(c->filter == NULL) {
__dex_recurse(c);
__parsley_recurse(c);
} else {
// printf("k\n");
sprintbuf(c->buf, "<xsl:for-each select=\"%s\"><xsl:if test=\"position() = 1\">\n", c->filter);
__dex_recurse(c);
__parsley_recurse(c);
sprintbuf(c->buf, "</xsl:if></xsl:for-each>\n");
}
}

View File

@ -1,7 +1,7 @@
#ifndef DEXTER_H_INCLUDED
#define DEXTER_H_INCLUDED
#ifndef PARSLEY_H_INCLUDED
#define PARSLEY_H_INCLUDED
#define DEX_BUF_SIZE 1024
#define PARSLEY_BUF_SIZE 1024
#include <stdbool.h>
#include <libxslt/xslt.h>
@ -9,25 +9,25 @@
#include <libxslt/transform.h>
static int dex_debug_mode = 0;
static char* last_dex_error;
static int parsley_debug_mode = 0;
static char* last_parsley_error;
#include <json/json.h>
typedef struct __compiled_dex {
typedef struct __compiled_parsley {
char* raw_stylesheet;
xsltStylesheetPtr stylesheet;
char* error;
} compiled_dex;
} compiled_parsley;
typedef struct __parsed_dex {
typedef struct __parsed_parsley {
xmlDocPtr xml;
char *error;
compiled_dex *dex;
} parsed_dex;
compiled_parsley *parsley;
} parsed_parsley;
typedef compiled_dex * dexPtr;
typedef parsed_dex * parsedDexPtr;
typedef compiled_parsley * parsleyPtr;
typedef parsed_parsley * parsedParsleyPtr;
typedef struct __key_node {
char* name;
@ -37,12 +37,12 @@ typedef struct __key_node {
typedef key_node * keyPtr;
typedef struct __dex_context {
typedef struct __parsley_context {
struct printbuf * buf;
struct printbuf * key_buf;
keyPtr keys;
struct json_object * json;
struct __dex_context * parent;
struct __parsley_context * parent;
char* tag;
char* filter;
char* expr;
@ -54,23 +54,23 @@ typedef struct __dex_context {
int string;
int flags;
int zipped;
} dex_context;
} parsley_context;
typedef dex_context * contextPtr;
typedef parsley_context * contextPtr;
void parsed_dex_free(parsedDexPtr);
void parsed_parsley_free(parsedParsleyPtr);
void dex_free(dexPtr);
dexPtr dex_compile(char* dex, char* incl);
parsedDexPtr dex_parse_file(dexPtr, char*, bool);
parsedDexPtr dex_parse_string(dexPtr, char*, size_t, bool);
parsedDexPtr dex_parse_doc(dexPtr, xmlDocPtr);
void parsley_free(parsleyPtr);
parsleyPtr parsley_compile(char* parsley, char* incl);
parsedParsleyPtr parsley_parse_file(parsleyPtr, char*, bool);
parsedParsleyPtr parsley_parse_string(parsleyPtr, char*, size_t, bool);
parsedParsleyPtr parsley_parse_doc(parsleyPtr, xmlDocPtr);
enum {
DEX_OPTIONAL = 1,
PARSLEY_OPTIONAL = 1,
};
static contextPtr dex_parsing_context;
static contextPtr parsley_parsing_context;
static char* full_expr(contextPtr, char*);
static char* expr_join(char*, char*);
@ -84,13 +84,13 @@ static contextPtr tagged_context(contextPtr, char*);
static contextPtr new_context(struct json_object *, struct printbuf *);
static contextPtr deeper_context(contextPtr, char*, struct json_object *);
static void __dex_recurse(contextPtr);
static void __parsley_recurse(contextPtr);
static char* filter_intersection(char*, char*);
static char* inner_key_of(struct json_object *);
static char* inner_key_each(struct json_object *);
static void visit(parsedDexPtr ptr, xmlNodePtr xml, bool bubbling);
static void visit(parsedParsleyPtr ptr, xmlNodePtr xml, bool bubbling);
static bool xml_empty(xmlNodePtr xml);
#endif

View File

@ -3,7 +3,7 @@
#include <string.h>
#include "kstring.h"
#include "printbuf.h"
#include "dexter.h"
#include "parsley.h"
#include "xml2json.h"
#include <libxslt/xslt.h>
#include <libxslt/xsltInternals.h>
@ -21,7 +21,7 @@ struct arguments
struct list_elem *include_files;
int input_xml;
int output_xml;
char *dex;
char *parsley;
char *input_file;
char *output_file;
};
@ -32,10 +32,10 @@ struct list_elem {
char *string;
};
const char *argp_program_version = "dexter 0.1";
const char *argp_program_version = "parsley 0.1";
const char *argp_program_bug_address = "<kyle@kylemaxwell.com>";
static char args_doc[] = "DEX_FILE FILE_TO_PARSE";
static char doc[] = "Dexter is a dex parser.";
static char doc[] = "Parsley is a parslet parser.";
static struct argp_option options[] = {
{"input-xml", 'x', 0, 0, "Use the XML parser (not HTML)" },
@ -72,7 +72,7 @@ static error_t parse_opt (int key, char *arg, struct argp_state *state)
case ARGP_KEY_ARG:
switch(state->arg_num){
case 0:
arguments->dex = arg;
arguments->parsley = arg;
break;
case 1:
arguments->input_file = arg;
@ -106,23 +106,23 @@ int main (int argc, char **argv) {
struct printbuf *buf = printbuf_new();
struct printbuf *incl = printbuf_new();
FILE * fd = dex_fopen(arguments.dex, "r");
FILE * fd = parsley_fopen(arguments.parsley, "r");
printbuf_file_read(fd, buf);
while(elemptr->has_next) {
elemptr = elemptr->next;
FILE* f = dex_fopen(elemptr->string, "r");
FILE* f = parsley_fopen(elemptr->string, "r");
printbuf_file_read(f, incl);
fclose(f);
}
dexPtr compiled = dex_compile(buf->buf, incl->buf);
parsleyPtr compiled = parsley_compile(buf->buf, incl->buf);
if(compiled->error != NULL) {
fprintf(stderr, "%s\n", compiled->error);
exit(1);
}
parsedDexPtr ptr = dex_parse_file(compiled, arguments.input_file, !(arguments.input_xml));
parsedParsleyPtr ptr = parsley_parse_file(compiled, arguments.input_file, !(arguments.input_xml));
if(ptr->error != NULL) {
fprintf(stderr, "Parsing failed: %s\n", ptr->error);
@ -133,7 +133,7 @@ int main (int argc, char **argv) {
xmlSaveFormatFile(arguments.output_file, ptr->xml, 1);
} else {
struct json_object *json = xml2json(ptr->xml->children->children);
FILE* f = dex_fopen(arguments.output_file, "w");
FILE* f = parsley_fopen(arguments.output_file, "w");
fprintf(f, "%s\n", json_object_to_json_string(json));
fclose(f);
}

View File

@ -1,22 +1,22 @@
#include "dex_mem.h"
#include "parsley_mem.h"
#include <stdlib.h>
#include <stdbool.h>
#include <stdio.h>
static struct obstack dex_obstack;
static bool dex_obstack_initialized = false;
static struct obstack parsley_obstack;
static bool parsley_obstack_initialized = false;
void dex_collect() {
obstack_free(&dex_obstack, NULL);
obstack_init(&dex_obstack);
void parsley_collect() {
obstack_free(&parsley_obstack, NULL);
obstack_init(&parsley_obstack);
}
void * dex_alloc(int size) {
if(!dex_obstack_initialized) {
obstack_init(&dex_obstack);
dex_obstack_initialized = true;
void * parsley_alloc(int size) {
if(!parsley_obstack_initialized) {
obstack_init(&parsley_obstack);
parsley_obstack_initialized = true;
}
void * mem = obstack_alloc(&dex_obstack, size);
void * mem = obstack_alloc(&parsley_obstack, size);
void * ptr = mem;
for(int i = 0; i < size; i++)
{

View File

@ -1,13 +1,13 @@
#ifndef DEX_MEM_H_INCLUDED
#define DEX_MEM_H_INCLUDED
#ifndef PARSLEY_MEM_H_INCLUDED
#define PARSLEY_MEM_H_INCLUDED
#define obstack_chunk_alloc malloc
#define obstack_chunk_free free
#include "obstack.h"
void dex_collect();
void * dex_alloc(int size);
void parsley_collect();
void * parsley_alloc(int size);
#endif

View File

@ -4,7 +4,7 @@
#include <string.h>
#include "kstring.h"
#include "printbuf.h"
#include "dexter.h"
#include "parsley.h"
#include "util.h"
struct list_elem {
@ -16,14 +16,14 @@ struct list_elem {
struct arguments
{
struct list_elem *include_files;
char *dex;
char *parsley;
char *output_file;
};
const char *argp_program_version = "dexterc 0.1";
const char *argp_program_version = "parsleyc 0.1";
const char *argp_program_bug_address = "<kyle@kylemaxwell.com>";
static char args_doc[] = "DEX_FILE";
static char doc[] = "Dexter is a dex to XSLT compiler";
static char doc[] = "Parsleyc is a parslet to XSLT compiler";
static struct argp_option options[] = {
{"debug", 'd', 0, 0, "Turn on Bison parser debugging" },
@ -48,14 +48,14 @@ static error_t parse_opt (int key, char *arg, struct argp_state *state)
base->has_next = 1;
break;
case 'd':
// dex_set_debug_mode(1);
// parsley_set_debug_mode(1);
break;
case 'o':
arguments->output_file = arg;
break;
case ARGP_KEY_ARG:
if (state->arg_num >= 1) argp_usage (state);
arguments->dex = arg;
arguments->parsley = arg;
break;
case ARGP_KEY_END:
if (state->arg_num < 1) argp_usage (state);
@ -76,29 +76,29 @@ int main (int argc, char **argv) {
arguments.include_files = elemptr;
arguments.output_file = "-";
arguments.dex = "-";
arguments.parsley = "-";
argp_parse (&argp, argc, argv, 0, 0, &arguments);
struct printbuf* dex = printbuf_new();
struct printbuf* parsley = printbuf_new();
struct printbuf* incl = printbuf_new();
FILE* in = dex_fopen(arguments.dex, "r");
FILE* in = parsley_fopen(arguments.parsley, "r");
printbuf_file_read(in, dex);
printbuf_file_read(in, parsley);
while(elemptr->has_next) {
elemptr = elemptr->next;
FILE* f = dex_fopen(elemptr->string, "r");
FILE* f = parsley_fopen(elemptr->string, "r");
printbuf_file_read(f, incl);
fclose(f);
}
dexPtr compiled = dex_compile(dex->buf, incl->buf);
parsleyPtr compiled = parsley_compile(parsley->buf, incl->buf);
if(compiled->error != NULL) {
fprintf(stderr, "%s\n", compiled->error);
exit(1);
}
FILE* fo = dex_fopen(arguments.output_file, "w");
FILE* fo = parsley_fopen(arguments.output_file, "w");
fprintf(fo, compiled->raw_stylesheet);
fclose(fo);

View File

@ -1,5 +1,5 @@
#include <Python.h>
#include <dexter.h>
#include <parsley.h>
#include <libxslt/xslt.h>
#include <libexslt/exslt.h>
#include <libxslt/xsltInternals.h>
@ -8,7 +8,6 @@
#include <libxml/HTMLparser.h>
#include <libxml/HTMLtree.h>
#include <libxml/xmlwriter.h>
#include <dexter.h>
#include <string.h>
#include <stdio.h>
#include <json/json.h>
@ -16,12 +15,12 @@
typedef struct {
PyObject_HEAD
dexPtr dex;
} DexPy;
parsleyPtr parsley;
} PyParsley;
static PyTypeObject dexpy_DexPyType;
static PyTypeObject pyparsley_PyParsleyType;
static PyMethodDef dexpy_methods[] = {
static PyMethodDef pyparsley_methods[] = {
{NULL} /* Sentinel */
};
@ -32,7 +31,7 @@ static PyMethodDef dexpy_methods[] = {
static PyObject *jsonmodule;
PyMODINIT_FUNC
initdexpy(void)
initpyparsley(void)
{
jsonmodule = PyImport_ImportModule("json");
if(jsonmodule == NULL)
@ -40,34 +39,34 @@ initdexpy(void)
PyObject* m;
dexpy_DexPyType.tp_new = PyType_GenericNew;
if (PyType_Ready(&dexpy_DexPyType) < 0)
pyparsley_PyParsleyType.tp_new = PyType_GenericNew;
if (PyType_Ready(&pyparsley_PyParsleyType) < 0)
return;
m = Py_InitModule3("dexpy", dexpy_methods,
"Python binding for dexter");
m = Py_InitModule3("pyparsley", pyparsley_methods,
"Python binding for parsley");
Py_INCREF(&dexpy_DexPyType);
PyModule_AddObject(m, "DexPy", (PyObject *)&dexpy_DexPyType);
Py_INCREF(&pyparsley_PyParsleyType);
PyModule_AddObject(m, "PyParsley", (PyObject *)&pyparsley_PyParsleyType);
}
static void
DexPy_dealloc(DexPy* self)
PyParsley_dealloc(PyParsley* self)
{
if(self->dex != NULL) dex_free(self->dex);
if(self->parsley != NULL) parsley_free(self->parsley);
self->ob_type->tp_free((PyObject*)self);
}
static PyObject *
DexPy_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
PyParsley_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
DexPy *self;
self = (DexPy *)type->tp_alloc(type, 0);
PyParsley *self;
self = (PyParsley *)type->tp_alloc(type, 0);
return (PyObject *)self;
}
static int
DexPy_init(DexPy *self, PyObject *args, PyObject *kwds)
PyParsley_init(PyParsley *self, PyObject *args, PyObject *kwds)
{
PyObject *script;
char *string = "";
@ -89,10 +88,10 @@ DexPy_init(DexPy *self, PyObject *args, PyObject *kwds)
string = PyString_AsString(script);
if(string == NULL) return -1;
self->dex = dex_compile(string, incl);
self->parsley = parsley_compile(string, incl);
if(self->dex->error != NULL) {
PyErr_SetString(PyExc_RuntimeError, self->dex->error);
if(self->parsley->error != NULL) {
PyErr_SetString(PyExc_RuntimeError, self->parsley->error);
Py_DECREF(self);
return -1;
}
@ -117,7 +116,7 @@ pythonize_recurse(xmlNodePtr xml) {
PyDict_SetItemString(obj, child->name, pythonize_recurse(child->children));
child = child->next;
}
} else if(!strcmp(xml->ns->prefix, "dexter")) {
} else if(!strcmp(xml->ns->prefix, "parsley")) {
if(!strcmp(xml->name, "groups")) {
obj = PyList_New(0);
while(child != NULL) {
@ -125,7 +124,7 @@ pythonize_recurse(xmlNodePtr xml) {
child = child->next;
}
} else if(!strcmp(xml->name, "group")) {
// Implicitly handled by dexter:groups handler
// Implicitly handled by parsley:groups handler
}
}
break;
@ -141,11 +140,11 @@ pythonize_recurse(xmlNodePtr xml) {
}
static PyObject *
DexPy_parse_doc(parsedDexPtr ptr, char *type) {
PyParsley_parse_doc(parsedParsleyPtr ptr, char *type) {
if(ptr->error != NULL || ptr->xml == NULL) {
if(ptr->error == NULL) ptr->error = strdup("Unknown dex error");
if(ptr->error == NULL) ptr->error = strdup("Unknown parsley error");
PyErr_SetString(PyExc_RuntimeError, ptr->error);
parsed_dex_free(ptr);
parsed_parsley_free(ptr);
return NULL;
}
@ -167,19 +166,19 @@ DexPy_parse_doc(parsedDexPtr ptr, char *type) {
return Py_None;
}
}
parsed_dex_free(ptr);
parsed_parsley_free(ptr);
return output;
}
static PyObject *
DexPy_parse(DexPy *self, PyObject *args, PyObject *keywords)
PyParsley_parse(PyParsley *self, PyObject *args, PyObject *keywords)
{
char *file = NULL;
char *string = NULL;
char *input = "html";
char *output = "python";
int len;
parsedDexPtr ptr;
parsedParsleyPtr ptr;
static char * list[] = { "file", "string", "input", "output", NULL };
@ -188,41 +187,41 @@ DexPy_parse(DexPy *self, PyObject *args, PyObject *keywords)
return NULL;
}
if(self->dex == NULL) {
PyErr_SetString(PyExc_RuntimeError, "dex data is NULL");
if(self->parsley == NULL) {
PyErr_SetString(PyExc_RuntimeError, "parsley data is NULL");
return NULL;
}
if(file != NULL) {
ptr = dex_parse_file(self->dex, file, !strcmp(input, "html"));
ptr = parsley_parse_file(self->parsley, file, !strcmp(input, "html"));
} else {
ptr = dex_parse_string(self->dex, string, len, !strcmp(input, "html"));
ptr = parsley_parse_string(self->parsley, string, len, !strcmp(input, "html"));
}
return DexPy_parse_doc(ptr, output);
return PyParsley_parse_doc(ptr, output);
}
static PyMethodDef DexPy_methods[] = {
{"parse", (PyCFunction)DexPy_parse, METH_VARARGS | METH_KEYWORDS,
static PyMethodDef PyParsley_methods[] = {
{"parse", (PyCFunction)PyParsley_parse, METH_VARARGS | METH_KEYWORDS,
"Parses with a variety of options"
},
// {"parse_string", (PyCFunction)DexPy_parse_string, METH_VARARGS,
// "Parses an in-memory string with the current dex"
// {"parse_string", (PyCFunction)PyParsley_parse_string, METH_VARARGS,
// "Parses an in-memory string with the current parslet"
// },
// {"parse_file", (PyCFunction)DexPy_parse_file, METH_VARARGS,
// "Parses file or url with the current dex"
// {"parse_file", (PyCFunction)PyParsley_parse_file, METH_VARARGS,
// "Parses file or url with the current parslet"
// },
{NULL} /* Sentinel */
};
static PyTypeObject dexpy_DexPyType = {
static PyTypeObject pyparsley_PyParsleyType = {
PyObject_HEAD_INIT(NULL)
0, /*ob_size*/
"dexpy.DexPy", /*tp_name*/
sizeof(DexPy), /*tp_basicsize*/
"pyparsley.PyParsley", /*tp_name*/
sizeof(PyParsley), /*tp_basicsize*/
0, /*tp_itemsize*/
(destructor) DexPy_dealloc, /*tp_dealloc*/
(destructor) PyParsley_dealloc, /*tp_dealloc*/
0, /*tp_print*/
0, /*tp_getattr*/
0, /*tp_setattr*/
@ -238,14 +237,14 @@ static PyTypeObject dexpy_DexPyType = {
0, /*tp_setattro*/
0, /*tp_as_buffer*/
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /*tp_flags*/
"DexPy objects", /* tp_doc */
"PyParsley objects", /* tp_doc */
0, /* tp_traverse */
0, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
DexPy_methods, /* tp_methods */
PyParsley_methods, /* tp_methods */
0, /* tp_members */
0, /* tp_getset */
0, /* tp_base */
@ -253,7 +252,7 @@ static PyTypeObject dexpy_DexPyType = {
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
(initproc)DexPy_init, /* tp_init */
(initproc)PyParsley_init, /* tp_init */
0, /* tp_alloc */
DexPy_new, /* tp_new */
PyParsley_new, /* tp_new */
};

View File

@ -6,7 +6,7 @@ from distutils.core import setup, Extension
from subprocess import Popen, PIPE
white = re.compile(r'\s+')
flags = "-ljson -ldexter -I/usr/include -I/usr/local/include -I/opt/local/include -L/usr/lib -L/usr/local/lib -L/opt/local/lib "
flags = "-ljson -lparsley -I/usr/include -I/usr/local/include -I/opt/local/include -L/usr/lib -L/usr/local/lib -L/opt/local/lib "
flags += Popen(["xml2-config", "--libs", "--cflags"], stdout=PIPE).communicate()[0]
flags += Popen(["xslt-config", "--libs", "--cflags"], stdout=PIPE).communicate()[0]
flags = white.sub(" ", flags)
@ -15,8 +15,8 @@ libraries = re.findall(r'-l(\S+)', flags)
include_dirs = re.findall(r'-I(\S+)', flags)
lib_dirs = re.findall(r'-L(\S+)', flags)
setup(name="dexpy", version="1.0",
ext_modules=[Extension("dexpy", ["dexpymodule.c"],
setup(name="pyparsley", version="1.0",
ext_modules=[Extension("pyparsley", ["pyparsleymodule.c"],
library_dirs = lib_dirs,
include_dirs = include_dirs,
libraries = libraries

View File

@ -1,29 +1,29 @@
#!/usr/bin/env python2.6
import unittest
from dexpy import DexPy
from pyparsley import PyParsley
from inspect import currentframe
from os.path import dirname
class TestDexPy(unittest.TestCase):
class TestPyParsley(unittest.TestCase):
def setUp(self):
self.dex = DexPy({'title': 'title'})
self.alt_dex = DexPy('{"title": "title"}')
self.parsley = PyParsley({'title': 'title'})
self.alt_parsley = PyParsley('{"title": "title"}')
self.__file__ = currentframe().f_code.co_filename
self.__dir__ = dirname(self.__file__)
self.file = self.__dir__ + '/../../test/yelp.html'
self.json = '{ "title": "\\t\\tNick\'s Crispy Tacos - Russian Hill - San Francisco, CA\\n" }'
def test_file_xml(self):
parsed = self.dex.parse(file = self.file, output = "json")
parsed = self.parsley.parse(file = self.file, output = "json")
self.assertEquals(self.json, parsed)
def test_json_file_xml(self):
parsed = self.alt_dex.parse(file = self.file, output = "json")
parsed = self.alt_parsley.parse(file = self.file, output = "json")
self.assertEquals(self.json, parsed)
def test_native(self):
parsed = self.alt_dex.parse(file = self.file, output = "python")
parsed = self.alt_parsley.parse(file = self.file, output = "python")
self.assertEquals({ "title": "\t\tNick's Crispy Tacos - Russian Hill - San Francisco, CA\n" }, parsed)
if __name__ == '__main__':

View File

@ -17,6 +17,6 @@ task :configure do
end
task :install do
system "gem build dexterous.gemspec"
system "gem install dexterous"
system "gem build parsley-ruby.gemspec"
system "gem install parsley-ruby"
end

View File

@ -8,56 +8,56 @@
#include <libxml/HTMLparser.h>
#include <libxml/HTMLtree.h>
#include <libxml/xmlwriter.h>
#include <dexter.h>
#include <parsley.h>
#include <json/json.h>
#include <xml2json.h>
VALUE _new(VALUE, VALUE, VALUE);
VALUE _parse_file(VALUE, VALUE, VALUE, VALUE);
VALUE _parse_string(VALUE, VALUE, VALUE, VALUE);
VALUE _parse_doc(parsedDexPtr, VALUE);
VALUE _parse_doc(parsedParsleyPtr, VALUE);
VALUE rubify_recurse(xmlNodePtr xml);
VALUE c_dex_err;
VALUE c_dex;
VALUE c_parsley_err;
VALUE c_parsley;
void Init_cdexter()
void Init_cparsley()
{
c_dex = rb_define_class("CDexter", rb_cObject);
c_dex_err = rb_define_class("DexError", rb_eRuntimeError);
rb_define_singleton_method(c_dex, "new", _new, 2);
rb_define_method(c_dex, "parse_file", _parse_file, 3);
rb_define_method(c_dex, "parse_string", _parse_string, 3);
c_parsley = rb_define_class("CParsley", rb_cObject);
c_parsley_err = rb_define_class("ParsleyError", rb_eRuntimeError);
rb_define_singleton_method(c_parsley, "new", _new, 2);
rb_define_method(c_parsley, "parse_file", _parse_file, 3);
rb_define_method(c_parsley, "parse_string", _parse_string, 3);
}
VALUE _new(VALUE self, VALUE dex, VALUE incl){
dexPtr ptr = dex_compile(STR2CSTR(dex), STR2CSTR(incl));
VALUE _new(VALUE self, VALUE parsley, VALUE incl){
parsleyPtr ptr = parsley_compile(STR2CSTR(parsley), STR2CSTR(incl));
if(ptr->error != NULL) {
rb_raise(c_dex_err, ptr->error);
dex_free(ptr);
rb_raise(c_parsley_err, ptr->error);
parsley_free(ptr);
return Qnil;
}
return Data_Wrap_Struct(c_dex, 0, dex_free, ptr);
return Data_Wrap_Struct(c_parsley, 0, parsley_free, ptr);
}
VALUE _parse_file(VALUE self, VALUE name, VALUE input, VALUE output){
dexPtr dex;
Data_Get_Struct(self, dexPtr, dex);
return _parse_doc(dex_parse_file(dex, STR2CSTR(name), input == ID2SYM(rb_intern("html"))), output);
parsleyPtr parsley;
Data_Get_Struct(self, parsleyPtr, parsley);
return _parse_doc(parsley_parse_file(parsley, STR2CSTR(name), input == ID2SYM(rb_intern("html"))), output);
}
VALUE _parse_string(VALUE self, VALUE string, VALUE input, VALUE output) {
dexPtr dex;
Data_Get_Struct(self, dexPtr, dex);
parsleyPtr parsley;
Data_Get_Struct(self, parsleyPtr, parsley);
char* cstr = STR2CSTR(string);
return _parse_doc(dex_parse_string(dex, cstr, strlen(cstr), input == ID2SYM(rb_intern("html"))), output);
return _parse_doc(parsley_parse_string(parsley, cstr, strlen(cstr), input == ID2SYM(rb_intern("html"))), output);
}
VALUE _parse_doc(parsedDexPtr ptr, VALUE type) {
VALUE _parse_doc(parsedParsleyPtr ptr, VALUE type) {
if(ptr->error != NULL || ptr->xml == NULL) {
if(ptr->error == NULL) ptr->error = strdup("Unknown dex error");
rb_raise(c_dex_err, ptr->error);
parsed_dex_free(ptr);
if(ptr->error == NULL) ptr->error = strdup("Unknown parsley error");
rb_raise(c_parsley_err, ptr->error);
parsed_parsley_free(ptr);
return Qnil;
}
@ -77,7 +77,7 @@ VALUE _parse_doc(parsedDexPtr ptr, VALUE type) {
if(output == NULL) output = Qnil;
}
parsed_dex_free(ptr);
parsed_parsley_free(ptr);
return output;
}
@ -97,7 +97,7 @@ VALUE rubify_recurse(xmlNodePtr xml) {
rb_hash_aset(obj, rb_str_new2(child->name), rubify_recurse(child->children));
child = child->next;
}
} else if(!strcmp(xml->ns->prefix, "dexter")) {
} else if(!strcmp(xml->ns->prefix, "parsley")) {
if(!strcmp(xml->name, "groups")) {
obj = rb_ary_new();
while(child != NULL) {
@ -105,7 +105,7 @@ VALUE rubify_recurse(xmlNodePtr xml) {
child = child->next;
}
} else if(!strcmp(xml->name, "group")) {
// Implicitly handled by dexter:groups handler
// Implicitly handled by parsley:groups handler
}
}
break;

View File

@ -61,7 +61,7 @@ mylib = %w[/usr/local/lib /opt/local/lib /usr/lib]
find_header('json/json.h', INCLUDEDIR, *myincl) or abort "need json/json.h"
find_library('json', 'json_object_new_string', LIBDIR, *mylib) or abort "need libjson"
find_header('dexter.h', INCLUDEDIR, *myincl) or abort "need dexter.h"
find_library('dexter', 'dex_compile', LIBDIR, *mylib) or abort "need libdexter"
find_header('parsley.h', INCLUDEDIR, *myincl) or abort "need parsley.h"
find_library('parsley', 'parsley_compile', LIBDIR, *mylib) or abort "need libparsley"
create_makefile('cdexter')
create_makefile('cparsley')

View File

@ -1,16 +1,16 @@
require File.dirname(__FILE__) + "/../ext/cdexter"
require File.dirname(__FILE__) + "/../ext/cparsley"
require "rubygems"
require "json"
require "thread"
class Dexterous
def initialize(dex, incl = "")
if(dex.is_a?(Hash))
dex = dex.to_json
class Parsley
def initialize(parsley, incl = "")
if(parsley.is_a?(Hash))
parsley = parsley.to_json
end
@@mutex ||= Mutex.new
@@mutex.synchronize do
@dex = CDexter.new(dex, incl)
@parsley = CParsley.new(parsley, incl)
end
end
@ -31,9 +31,9 @@ class Dexterous
options[:input] ||= :html
options[:output]||= :ruby
if options[:file]
@dex.parse_file options[:file], options[:input], options[:output]
@parsley.parse_file options[:file], options[:input], options[:output]
else
@dex.parse_string options[:string], options[:input], options[:output]
@parsley.parse_string options[:string], options[:input], options[:output]
end
end
end

View File

@ -1,11 +1,11 @@
Gem::Specification.new do |s|
s.name = "dexterous"
s.name = "parsley-ruby"
s.version = "0.1.0"
s.date = "2008-08-10"
s.summary = "Ruby binding for dexter"
s.summary = "Ruby binding for parsley"
s.email = "kyle@kylemaxwell.com"
s.homepage = "http://github.com/fizx/robots"
s.description = "Ruby binding for dexter"
s.homepage = "http://github.com/fizx/parsley-ruby"
s.description = "Ruby binding for parsley"
s.has_rdoc = true
s.require_paths = ["lib", "ext"]
s.extensions = "ext/extconf.rb"

View File

@ -1,52 +1,52 @@
require "test/unit"
require File.dirname(__FILE__) + "/../lib/dexterous"
require File.dirname(__FILE__) + "/../lib/parsley"
class TestDexterous < Test::Unit::TestCase
class TestParsley < Test::Unit::TestCase
def setup
@file = File.dirname(__FILE__) + "/../../test/yelp.html"
end
def test_yelp
@dex = Dexterous.new(File.read(File.dirname(__FILE__) + "/../../test/yelp-home.dex"))
out = @dex.parse(:file => File.dirname(__FILE__) + "/../../test/yelp-home.html")
@parsley = Parsley.new(File.read(File.dirname(__FILE__) + "/../../test/yelp-home.let"))
out = @parsley.parse(:file => File.dirname(__FILE__) + "/../../test/yelp-home.html")
assert_equal "/c/sf/shopping", out["categories"][0]["href"]
end
def test_yelp_xml
@dex = Dexterous.new(File.read(File.dirname(__FILE__) + "/../../test/yelp-home.dex"))
out = @dex.parse(:file => File.dirname(__FILE__) + "/../../test/yelp-home.html", :output => :xml)
@parsley = Parsley.new(File.read(File.dirname(__FILE__) + "/../../test/yelp-home.let"))
out = @parsley.parse(:file => File.dirname(__FILE__) + "/../../test/yelp-home.html", :output => :xml)
end
def test_simple
@dex = Dexterous.new("hi" => "h1")
assert_equal({"hi" => "Nick's Crispy Tacos"}, @dex.parse(:file => @file))
@parsley = Parsley.new("hi" => "h1")
assert_equal({"hi" => "Nick's Crispy Tacos"}, @parsley.parse(:file => @file))
end
def test_simple_string
@dex = Dexterous.new("hi" => "h1")
assert_equal({"hi" => "Nick's Crispy Tacos"}, @dex.parse(:string => "<html><body><h1>Nick's Crispy Tacos</h1></body></html>"))
@parsley = Parsley.new("hi" => "h1")
assert_equal({"hi" => "Nick's Crispy Tacos"}, @parsley.parse(:string => "<html><body><h1>Nick's Crispy Tacos</h1></body></html>"))
end
def test_xml
@dex = Dexterous.new("hi" => "h1")
xml = "<?xml version=\"1.0\"?>\n<dexter:root xmlns:dexter=\"http://kylemaxwell.com/dexter\"><hi>Nick's Crispy Tacos</hi></dexter:root>\n"
assert_equal(xml, @dex.parse(:file => @file, :output => :xml))
@parsley = Parsley.new("hi" => "h1")
xml = "<?xml version=\"1.0\"?>\n<parsley:root xmlns:parsley=\"http://parslets.com/json\"><hi>Nick's Crispy Tacos</hi></parsley:root>\n"
assert_equal(xml, @parsley.parse(:file => @file, :output => :xml))
end
def test_json
@dex = Dexterous.new("hi" => "h1")
assert_equal('{ "hi": "Nick\'s Crispy Tacos" }', @dex.parse(:file => @file, :output => :json))
@parsley = Parsley.new("hi" => "h1")
assert_equal('{ "hi": "Nick\'s Crispy Tacos" }', @parsley.parse(:file => @file, :output => :json))
end
def test_rescuable_file_error
@dex = Dexterous.new("hi" => "h1")
@parsley = Parsley.new("hi" => "h1")
@nonexistant_file = File.dirname(__FILE__) + "/../fixtures/yelp.html"
assert_equal({"hi" => "Nick's Crispy Tacos"}, @dex.parse(:file => @nonexistant_file)) rescue nil
assert_equal({"hi" => "Nick's Crispy Tacos"}, @parsley.parse(:file => @nonexistant_file)) rescue nil
end
def test_array_string
@dex = Dexterous.new({"foo" => ["li"]})
out = @dex.parse(:file => @file)
@parsley = Parsley.new({"foo" => ["li"]})
out = @parsley.parse(:file => @file)
assert_kind_of Hash, out
assert_kind_of Array, out["foo"], out.inspect
assert out["foo"].length > 1

View File

@ -1,7 +1,7 @@
require "rubygems"
require "nokogiri"
require "hpricot"
require "dexterous"
require "parsley"
require "benchmark"
require "pp"
@ -29,8 +29,8 @@ def parse(doc)
end
end
def dext
dex = Dexterous.new({
def pars
parslet = Parsley.new({
"name" => "h1",
"phone" => "#bizPhone",
"address" => "address",
@ -42,12 +42,12 @@ def dext
}
]
})
pp dex.parse(:file => YELP_HTML)
pp parslet.parse(:file => YELP_HTML)
end
Benchmark.bm do |x|
x.report("nokogiri: ") { 3.times { noko } }
x.report("hpricot: ") { 3.times { hpri } }
x.report("dexterous: ") { 3.times { dext } }
x.report("parsley: ") { 3.times { pars } }
end

32
util.c
View File

@ -1,13 +1,13 @@
#include "util.h"
#include "dexter.h"
#include "parsley.h"
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <stdbool.h>
static bool dex_exslt_registered = false;
static bool parsley_exslt_registered = false;
FILE* dex_fopen(char* name, char* mode) {
FILE* parsley_fopen(char* name, char* mode) {
FILE* fo;
if(!strcmp("-", name)) {
if(!strcmp("w", mode)) {
@ -27,16 +27,16 @@ FILE* dex_fopen(char* name, char* mode) {
void registerEXSLT() {
if(!dex_exslt_registered) {
if(!parsley_exslt_registered) {
exsltRegisterAll();
dex_register_all();
parsley_register_all();
init_xpath_alias();
exslt_org_regular_expressions_init();
dex_exslt_registered = true;
parsley_exslt_registered = true;
}
}
int dex_key_flags(char* key) {
int parsley_key_flags(char* key) {
char* ptr = key;
char* last_alnum = key;
char* last_paren = key;
@ -52,14 +52,14 @@ int dex_key_flags(char* key) {
while(*ptr++ != '\0'){
switch(*ptr){
case '?':
flags |= DEX_OPTIONAL;
flags |= PARSLEY_OPTIONAL;
break;
}
}
return flags;
}
char* dex_key_tag(char* key) {
char* parsley_key_tag(char* key) {
char *tag = astrdup(key);
char *ptr = tag;
while(*ptr++ != '\0'){
@ -71,7 +71,7 @@ char* dex_key_tag(char* key) {
return tag;
}
char* dex_key_filter(char* key) {
char* parsley_key_filter(char* key) {
char *expr = astrdup(key);
char *ptr = expr;
char *last_paren;
@ -93,26 +93,26 @@ char* dex_key_filter(char* key) {
char* sprintbuf_dex_header(struct printbuf *buf) {
char* sprintbuf_parsley_header(struct printbuf *buf) {
sprintbuf(buf, "<xsl:stylesheet version=\"1.0\" xmlns:xsl=\"http://www.w3.org/1999/XSL/Transform\"");
sprintbuf(buf, " xmlns:dex=\"http://kylemaxwell.com/dexter/library\"");
sprintbuf(buf, " xmlns:dexter=\"http://kylemaxwell.com/dexter\"");
sprintbuf(buf, " xmlns:lib=\"http://parslets.com/stdlib\"");
sprintbuf(buf, " xmlns:parslet=\"http://parslets.com/json\"");
sprintbuf(buf, " xmlns:str=\"http://exslt.org/strings\"");
sprintbuf(buf, " xmlns:set=\"http://exslt.org/sets\"");
sprintbuf(buf, " xmlns:math=\"http://exslt.org/math\"");
sprintbuf(buf, " xmlns:func=\"http://exslt.org/functions\"");
sprintbuf(buf, " xmlns:user=\"http://kylemaxwell.com/dexter/user-functions\"");
sprintbuf(buf, " xmlns:user=\"http://parslets.com/usre\"");
sprintbuf(buf, " xmlns:dyn=\"http://exslt.org/dynamic\"");
sprintbuf(buf, " xmlns:date=\"http://exslt.org/dates-and-times\"");
sprintbuf(buf, " xmlns:exsl=\"http://exslt.org/common\"");
sprintbuf(buf, " xmlns:saxon=\"http://icl.com/saxon\"");
sprintbuf(buf, " xmlns:regexp=\"http://exslt.org/regular-expressions\"");
sprintbuf(buf, " xmlns:regex=\"http://exslt.org/regular-expressions\"");
sprintbuf(buf, " extension-element-prefixes=\"dex str math set func dyn exsl saxon user date regexp regex\"");
sprintbuf(buf, " extension-element-prefixes=\"lib str math set func dyn exsl saxon user date regexp regex\"");
sprintbuf(buf, ">\n");
sprintbuf(buf, "<xsl:output method=\"xml\" indent=\"yes\"/>\n");
sprintbuf(buf, "<xsl:strip-space elements=\"*\"/>\n");
sprintbuf(buf, "<func:function name=\"dex:nl\"><xsl:param name=\"in\" select=\".\"/>");
sprintbuf(buf, "<func:function name=\"lib:nl\"><xsl:param name=\"in\" select=\".\"/>");
sprintbuf(buf, "<xsl:variable name=\"out\"><xsl:apply-templates mode=\"innertext\" select=\"exsl:node-set($in)\"/></xsl:variable>");
sprintbuf(buf, "<func:result select=\"$out\" /></func:function>");
sprintbuf(buf, "<xsl:template match=\"text()\" mode=\"innertext\"><xsl:value-of select=\".\" /></xsl:template>");

10
util.h
View File

@ -4,13 +4,13 @@
#include <stdio.h>
#include <json/json.h>
FILE* dex_fopen(char*, char*);
char* sprintbuf_dex_header(struct printbuf *);
FILE* parsley_fopen(char*, char*);
char* sprintbuf_parsley_header(struct printbuf *);
void registerEXSLT();
int dex_key_flags(char*);
char* dex_key_tag(char*);
char* dex_key_filter(char*);
int parsley_key_flags(char*);
char* parsley_key_tag(char*);
char* parsley_key_filter(char*);
#endif

View File

@ -23,7 +23,7 @@ static struct json_object * _xml2json(xmlNodePtr xml) {
json_object_object_add(json, child->name, xml2json(child->children));
child = child->next;
}
} else if(!strcmp(xml->ns->prefix, "dexter")) {
} else if(!strcmp(xml->ns->prefix, "parsley")) {
if(!strcmp(xml->name, "zipped")) {
int len = 0;
xmlNodePtr ptr = xml->children;
@ -66,7 +66,7 @@ static struct json_object * _xml2json(xmlNodePtr xml) {
child = child->next;
}
} else if(!strcmp(xml->name, "group")) {
// Implicitly handled by dexter:groups handler
// Implicitly handled by parsley:groups handler
}
}
break;