pyparsley/parsley.c

420 lines
13 KiB
C

#include <stdio.h>
#include <stdlib.h>
#include <argp.h>
#include <stdarg.h>
#include <json/json.h>
#include "kstring.h"
#include "parsley.h"
#include "y.tab.h"
#include "printbuf.h"
#include "functions.h"
#include "util.h"
#include <string.h>
#include <errno.h>
#include <ctype.h>
#include <stdbool.h>
#include <libxslt/xslt.h>
#include <libxslt/xsltInternals.h>
#include <libxslt/transform.h>
#include <libxml/parser.h>
#include <libxml/HTMLparser.h>
#include <libxml/HTMLtree.h>
#include <libxml/xmlwriter.h>
#include <libexslt/exslt.h>
/*
 * End-of-input hook for the generated scanner. It unconditionally reports 1;
 * by lex/flex convention a non-zero value means there is no further input to
 * switch to.
 */
int
yywrap(void)
{
  const int no_more_input = 1;
  return no_more_input;
}
/*
 * Releases a parse result: the transformed document, the error message and
 * the result structure itself. Passing NULL is a no-op.
 */
void parsed_parsley_free(parsedParsleyPtr ptr) {
  if(ptr == NULL) return;
  /* ptr->xml is a complete document (it is assigned from
   * xsltApplyStylesheet), so it must be released with xmlFreeDoc. xmlFree
   * would only release the top-level struct and leak every node below it. */
  if(ptr->xml != NULL) xmlFreeDoc(ptr->xml);
  free(ptr->error);
  free(ptr);
}
static parsedParsleyPtr parse_error(char* format, ...) {
parsedParsleyPtr ptr = (parsedParsleyPtr) calloc(sizeof(parsed_parsley), 1);
ptr->xml = NULL;
va_list args;
va_start(args, format);
vasprintf(&ptr->error, format, args);
va_end(args);
return ptr;
}
/*
 * Parses the document stored in `file` and runs the compiled parslet on it.
 *
 * parsley: compiled parslet to apply.
 * file:    path (or URL) handed to the libxml2 reader.
 * html:    true to read with the HTML parser, false for the XML parser.
 *
 * Returns the parse result; when the document cannot be read the result
 * carries an error message instead of output.
 */
parsedParsleyPtr parsley_parse_file(parsleyPtr parsley, char* file, bool html) {
  xmlDocPtr doc;
  if(html) {
    htmlParserCtxtPtr htmlCtxt = htmlNewParserCtxt();
    doc = htmlCtxtReadFile(htmlCtxt, file, NULL, HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
    htmlFreeParserCtxt(htmlCtxt);
  } else {
    xmlParserCtxtPtr ctxt = xmlNewParserCtxt();
    /* Same numeric values as the HTML_PARSE_* flags used before, but these
     * are the option names that belong to the XML reader. */
    doc = xmlCtxtReadFile(ctxt, file, NULL, XML_PARSE_RECOVER | XML_PARSE_NOERROR | XML_PARSE_NOWARNING);
    xmlFreeParserCtxt(ctxt);
  }
  if(doc == NULL) return parse_error("Couldn't parse file: %s\n", file);
  parsedParsleyPtr parsed = parsley_parse_doc(parsley, doc);
  /* The input document was created here and the transformation output is a
   * separate document, so release the input instead of leaking it. */
  xmlFreeDoc(doc);
  return parsed;
}
/*
 * Parses an in-memory document and runs the compiled parslet on it.
 *
 * parsley: compiled parslet to apply.
 * string:  buffer holding the document text.
 * size:    number of bytes of `string` to read.
 * html:    true to read with the HTML parser, false for the XML parser.
 *
 * Returns the parse result; when the buffer cannot be parsed the result
 * carries an error message instead of output.
 */
parsedParsleyPtr parsley_parse_string(parsleyPtr parsley, char* string, size_t size, bool html) {
  xmlDocPtr doc;
  if(html) {
    htmlParserCtxtPtr htmlCtxt = htmlNewParserCtxt();
    doc = htmlCtxtReadMemory(htmlCtxt, string, (int) size, "http://parslets.com/in-memory-string", NULL, HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
    /* The parser context is no longer needed once the document exists. */
    htmlFreeParserCtxt(htmlCtxt);
  } else {
    xmlParserCtxtPtr ctxt = xmlNewParserCtxt();
    /* Same numeric values as the HTML_PARSE_* flags used before, but these
     * are the option names that belong to the XML reader. */
    doc = xmlCtxtReadMemory(ctxt, string, (int) size, "http://parslets.com/in-memory-string", NULL, XML_PARSE_RECOVER | XML_PARSE_NOERROR | XML_PARSE_NOWARNING);
    xmlFreeParserCtxt(ctxt);
  }
  if(doc == NULL) return parse_error("Couldn't parse string");
  parsedParsleyPtr parsed = parsley_parse_doc(parsley, doc);
  /* The input document was created here and the transformation output is a
   * separate document, so release the input instead of leaking it. */
  xmlFreeDoc(doc);
  return parsed;
}
/*
 * Builds a "/"-separated path of node names leading to `node`.
 *
 * Walks from `node` towards the top of the tree, prepending each node's name,
 * and stops at the first node that has no name or no parent (that node is not
 * part of the path). The result is prefixed with a single "/".
 *
 * NOTE(review): ownership of the returned string depends on astrcat/astrcat3,
 * which are defined outside this file.
 */
static char *
xpath_of(xmlNodePtr node) {
  char *path = NULL;
  xmlNodePtr cur;
  for(cur = node; cur->name != NULL && cur->parent != NULL; cur = cur->parent) {
    if(path == NULL) {
      path = cur->name;
    } else {
      path = astrcat3(cur->name, "/", path);
    }
  }
  return astrcat("/", path);
}
/*
 * Detaches `xml` from its parent's child list.
 *
 * Does nothing when `xml` is NULL, has no parent, or is not actually present
 * in the parent's child list. The node's own parent/next/prev pointers are
 * deliberately left untouched: prune() still reads xml->parent after calling
 * this, and visit() reads child->next after a child may have been removed.
 * The detached node is not freed here.
 */
static void
unlink(xmlNodePtr xml) {
  if(xml == NULL || xml->parent == NULL) return;
  xmlNodePtr parent = xml->parent;
  xmlNodePtr before = NULL;
  xmlNodePtr cur = parent->children;
  /* Find xml among the siblings, remembering the node in front of it. */
  while(cur != NULL && cur != xml) {
    before = cur;
    cur = cur->next;
  }
  if(cur == NULL) return;
  if(before == NULL) {
    parent->children = xml->next;
  } else {
    before->next = xml->next;
  }
  /* Keep the back links consistent as well: previously the successor of a
   * removed first child kept pointing at it, and parent->last was never
   * updated when the removed node was the last child. */
  if(xml->next != NULL) {
    xml->next->prev = before;
  } else {
    parent->last = before;
  }
}
/*
 * Reports whether `xml` is named "root" with the prefix "parsley".
 * A NULL element, or one lacking a name or a prefix, is not a root.
 */
static bool
is_root(xmlElementPtr xml) {
  if(xml == NULL) return false;
  if(xml->name == NULL || xml->prefix == NULL) return false;
  if(strcmp(xml->name, "root") != 0) return false;
  return strcmp(xml->prefix, "parsley") == 0;
}
/*
 * Handles an empty result element.
 *
 * An element that carries any attribute is treated as optional: it is
 * detached from the tree and its parent is re-checked. Otherwise an
 * "<path> was empty" message is created (unless one is already being
 * carried) and the emptiness is propagated to the parent; when the parent is
 * the document's top-level element the message is stored in ptr->error.
 *
 * ptr: parse result that receives the error message.
 * xml: element found to be empty (NULL and non-element nodes end the walk).
 * err: message carried up from a deeper level, or NULL. This function takes
 *      ownership: it is stored in ptr->error or freed.
 */
static void
prune(parsedParsleyPtr ptr, xmlNodePtr xml, char* err) {
  if(xml == NULL || xml->type != XML_ELEMENT_NODE) {
    /* Walked past the top of the tree without recording anything. */
    free(err);
    return;
  }
  /* Read the attribute list through the node's own field instead of
   * reinterpreting the node as an xmlElement declaration. */
  bool is_optional = xml->properties != NULL;
  if(is_optional) {
    unlink(xml);
    /* The carried message is dropped once an optional ancestor absorbs it. */
    free(err);
    visit(ptr, xml->parent, true);
    return;
  }
  /* asprintf leaves the pointer unspecified on failure. */
  if(err == NULL && asprintf(&err, "%s was empty", xpath_of(xml)) < 0) err = NULL;
  if(xml->parent != xml->doc->children) {
    prune(ptr, xml->parent, err);
  } else if(err != NULL) {
    /* The latest message wins, as before, but the old one is released. */
    free(ptr->error);
    ptr->error = err;
  }
}
/*
 * Checks an element of the result tree for emptiness and, unless bubbling,
 * does the same for each of its children.
 *
 * ptr:      parse result that collects any error message.
 * xml:      node to inspect; NULL, non-element and parentless nodes are
 *           ignored.
 * bubbling: true when re-checking a parent after a child was removed; in
 *           that case the children are not visited again.
 */
static void
visit(parsedParsleyPtr ptr, xmlNodePtr xml, bool bubbling) {
  /* NULL can arrive from an empty result document or from prune()
   * re-checking the parent of a parentless node. */
  if(xml == NULL || xml->type != XML_ELEMENT_NODE) return;
  if(xml->parent == NULL) return;
  xmlNodePtr child = xml->children;
  if(xml_empty(xml)) prune(ptr, xml, NULL);
  if(bubbling) return;
  while(child != NULL) {
    visit(ptr, child, false);
    /* A removed child keeps its own next pointer, so this still advances
     * through the original sibling order. */
    child = child->next;
  }
}
/*
 * Reports whether `xml` has no meaningful content: every child must be a
 * text node with empty (or absent) text. A node without children is empty.
 */
static bool
xml_empty(xmlNodePtr xml) {
  xmlNodePtr child;
  for(child = xml->children; child != NULL; child = child->next) {
    if(child->type != XML_TEXT_NODE) return false;
    /* Only the first byte matters, and a text node with no content buffer
     * counts as empty instead of being passed to strlen(). */
    if(child->content != NULL && child->content[0] != '\0') return false;
  }
  return true;
}
/*
 * Applies the compiled parslet's stylesheet to `doc` and prunes the output.
 *
 * parsley: compiled parslet; its stylesheet drives the transformation.
 * doc:     input document; it is only read here and stays owned by the caller.
 *
 * Returns a heap-allocated result holding either the transformed document in
 * xml or a message in error, or NULL when the result cannot be allocated.
 */
parsedParsleyPtr parsley_parse_doc(parsleyPtr parsley, xmlDocPtr doc) {
  parsedParsleyPtr ptr = calloc(1, sizeof(parsed_parsley));
  if(ptr == NULL) return NULL;
  ptr->parsley = parsley;
  if(parsley != NULL) ptr->xml = xsltApplyStylesheet(parsley->stylesheet, doc, NULL);
  if(ptr->xml == NULL) {
    ptr->error = strdup("Internal runtime error");
    return ptr;
  }
  /* An output document without any node has nothing to prune. */
  if(ptr->xml->children != NULL) visit(ptr, ptr->xml->children, false);
  return ptr;
}
parsleyPtr parsley_compile(char* parsley_str, char* incl) {
parsleyPtr parsley = (parsleyPtr) calloc(sizeof(compiled_parsley), 1);
if(last_parsley_error != NULL) {
free(last_parsley_error);
last_parsley_error = NULL;
}
registerEXSLT();
struct json_object *json = json_tokener_parse(parsley_str);
if(is_error(json)) {
parsley->error = strdup("Your parslet is not valid json.");
// json_object_put(json); // frees json
return parsley;
}
struct printbuf* buf = printbuf_new();
sprintbuf_parsley_header(buf);
sprintbuf(buf, "%s\n", incl);
sprintbuf(buf, "<xsl:template match=\"/\">\n");
sprintbuf(buf, "<parsley:root>\n");
contextPtr context = new_context(json, buf);
__parsley_recurse(context);
json_object_put(json); // frees json
parsley->error = last_parsley_error;
sprintbuf(buf, "</parsley:root>\n");
sprintbuf(buf, "</xsl:template>\n");
sprintbuf(buf, context->key_buf->buf);
sprintbuf(buf, "</xsl:stylesheet>\n");
if(parsley->error == NULL) {
xmlParserCtxtPtr ctxt = xmlNewParserCtxt();
xmlDocPtr doc = xmlCtxtReadMemory(ctxt, buf->buf, buf->size, "http://parslets.com/compiled", NULL, 3);
xmlFreeParserCtxt(ctxt);
parsley->raw_stylesheet = strdup(buf->buf);
parsley->stylesheet = xsltParseStylesheetDoc(doc);
}
printbuf_free(buf);
parsley_collect();
return parsley;
}
/*
 * Creates the top-level compilation context.
 *
 * json: parsed parslet to walk.
 * buf:  output buffer that receives the generated stylesheet body.
 *
 * The context is named "root", has "/" as its full expression and gets a
 * fresh, empty buffer for key definitions; every other field starts cleared.
 */
static contextPtr new_context(struct json_object * json, struct printbuf *buf) {
  contextPtr c = parsley_alloc(sizeof(parsley_context));
  c->key_buf = printbuf_new();
  sprintbuf(c->key_buf, "");
  c->name = "root";
  c->tag = "root";
  c->full_expr = "/";
  c->expr = NULL;
  /* Previously left unset; cleared so that no field is read uninitialised. */
  c->raw_expr = NULL;
  c->magic = NULL;
  c->filter = NULL;
  c->buf = buf;
  c->json = json;
  c->parent = NULL;
  c->array = 0;
  c->string = 0;
  c->flags = 0;
  c->keys = NULL;
  c->zipped = 0;
  return c;
}
/*
 * Creates the context for one key/value pair nested inside `context`.
 *
 * context: enclosing context.
 * key:     parslet key; its tag, flags and filter are derived from it.
 * val:     value stored under `key`; for an array the first element is used.
 *
 * Also makes the new context the current parsing context before any
 * expression is parsed. The statement order matters: full_expr is computed in
 * two steps, and expr/filter are combined with `magic` only after full_expr
 * has consumed the plain filter.
 */
contextPtr deeper_context(contextPtr context, char* key, struct json_object * val) {
  contextPtr c = parsley_alloc(sizeof(parsley_context));
  c->key_buf = context->key_buf;
  c->keys = context->keys;
  c->tag = parsley_key_tag(key);
  c->flags = parsley_key_flags(key);
  c->name = astrcat3(context->name, ".", c->tag);
  /* __parsley_recurse reads context->zipped on nested contexts, so it must
   * start from a defined value here. */
  c->zipped = 0;
  parsley_parsing_context = c;
  c->array = val != NULL && json_object_is_type(val, json_type_array);
  c->json = c->array ? json_object_array_get_idx(val, 0) : val;
  /* c->json is NULL for a missing value and for an empty array; do not hand
   * NULL to json_object_is_type. */
  c->string = c->json != NULL && json_object_is_type(c->json, json_type_string);
  c->filter = parsley_key_filter(key);
  c->magic = ((c->filter == NULL) && c->array && !(c->string)) ? c->name : context->magic;
  if(context->filter != NULL && !c->array) c->magic = NULL;
  c->buf = context->buf;
  c->raw_expr = c->string ? myparse(astrdup(json_object_get_string(c->json))) : NULL;
  /* First resolve the filter against the parent, then the value expression
   * against the result. */
  c->full_expr = full_expr(context, c->filter);
  c->full_expr = full_expr(c, c->raw_expr);
  c->expr = filter_intersection(c->magic, c->raw_expr);
  c->filter = filter_intersection(c->magic, c->filter);
  c->parent = context;
  return c;
}
/*
 * Restricts `expr` to the nodes of the group identified by `key`.
 *
 * When both arguments are present, returns an expression of the form
 * set:intersection(key('<key>__key', $<key>__index), <expr>); otherwise
 * returns `expr` unchanged (which may be NULL).
 */
static char* filter_intersection(char* key, char* expr) {
  if(key == NULL || expr == NULL) return expr;
  return astrcat7("set:intersection(key('", key, "__key', $", key, "__index), ", expr, ")");
}
/*
 * Releases a compiled parslet: its error message, the generated stylesheet
 * text, the parsed stylesheet and the structure itself.
 */
void parsley_free(parsleyPtr ptr) {
  /* free() accepts NULL, so the two strings need no guard. */
  free(ptr->error);
  free(ptr->raw_stylesheet);
  if(ptr->stylesheet != NULL) {
    xsltFreeStylesheet(ptr->stylesheet);
  }
  free(ptr);
}
/*
 * Parser error callback: appends "<s> in key: <name>" to the accumulated
 * error text in last_parsley_error, one message per line.
 *
 * s: message supplied by the parser.
 */
void yyerror(const char * s) {
  struct printbuf *buf = printbuf_new();
  if(last_parsley_error != NULL) sprintbuf(buf, "%s\n", last_parsley_error);
  sprintbuf(buf, "%s in key: %s", s,
    parsley_parsing_context != NULL ? parsley_parsing_context->name : "(unknown)");
  /* The old text has been copied into buf; release it before replacing it so
   * repeated errors do not leak. */
  free(last_parsley_error);
  last_parsley_error = strdup(buf->buf);
  printbuf_free(buf);
}
/*
 * Returns the attribute text to add to a context's opening tag:
 * ` optional="true"` when the PARSLEY_OPTIONAL flag is set, "" otherwise.
 */
static char* optional(contextPtr c) {
  if(c->flags & PARSLEY_OPTIONAL) {
    return " optional=\"true\"";
  }
  return "";
}
static bool
all_strings(struct json_object * json) {
json_object_object_foreach(json, key, val) {
if(val == NULL || !json_object_is_type(val, json_type_string)) return false;
}
return true;
}
/*
 * Emits the XSLT fragment for every key/value pair of context->json.
 *
 * For each pair a child context is built with deeper_context(), an element
 * named after the child's tag is opened in the output buffer, the body that
 * matches the value's shape is written (see the per-case comments below), and
 * the element is closed again. Nested values are handled by recursion.
 * Nothing is emitted when context->json is NULL.
 */
void __parsley_recurse(contextPtr context) {
// Scratch variables shared by the cases below.
char* tmp;
struct printbuf * buf;
keyPtr keys;
contextPtr c;
if(context->json == NULL) return;
json_object_object_foreach(context->json, key, val) {
c = deeper_context(context, key, val);
sprintbuf(c->buf, "<%s%s>\n", c->tag, optional(c));
if(c->string) {
// The value is an XPath expression string.
if(c->array || context->zipped) {
// Multi-valued: one parsley:group per selected node.
if(c->filter){
// Case b: iterate over the filter and evaluate the raw expression per match.
sprintbuf(c->buf, "<parsley:groups optional=\"true\"><xsl:for-each select=\"%s\"><parsley:group optional=\"true\">\n", c->filter);
sprintbuf(c->buf, "<xsl:value-of select=\"%s\" />\n", c->raw_expr);
sprintbuf(c->buf, "</parsley:group></xsl:for-each></parsley:groups>\n");
} else {
// Case c: iterate over the expression itself and output each node.
sprintbuf(c->buf, "<parsley:groups optional=\"true\"><xsl:for-each select=\"%s\"><parsley:group optional=\"true\">\n", c->expr);
sprintbuf(c->buf, "<xsl:value-of select=\".\" />\n");
sprintbuf(c->buf, "</parsley:group></xsl:for-each></parsley:groups>\n");
}
} else {
// Single-valued string.
if(c->filter){
// Case d: evaluate the raw expression on the first node matched by the filter.
sprintbuf(c->buf, "<xsl:for-each select=\"%s\"><xsl:if test=\"position()=1\">\n", c->filter);
sprintbuf(c->buf, "<xsl:value-of select=\"%s\" />\n", c->raw_expr);
sprintbuf(c->buf, "</xsl:if></xsl:for-each>\n");
} else {
// Case e: plain value-of on the expression.
sprintbuf(c->buf, "<xsl:value-of select=\"%s\" />\n", c->expr);
}
}
} else { // the value is not a string
if(c->array) { // array whose first element is not a string
if(c->filter != NULL) {
// Case f: the key's filter selects the nodes; recurse once per match.
sprintbuf(c->buf, "<parsley:groups optional=\"true\"><xsl:for-each select=\"%s\"><parsley:group optional=\"true\">\n", c->filter);
__parsley_recurse(c);
sprintbuf(c->buf, "</parsley:group></xsl:for-each></parsley:groups>\n");
} else { // no filter: grouping has to be derived ("magic")
if(all_strings(c->json)) {
// Every member is a string: mark the context as zipped so that each
// member below is emitted as its own list of groups.
c->magic = NULL;
c->zipped = 1;
sprintbuf(c->buf, "<parsley:zipped>\n");
__parsley_recurse(c);
sprintbuf(c->buf, "</parsley:zipped>\n");
} else {
// Case h: remember the current node, then group by the first string
// expression found inside the nested value.
sprintbuf(c->buf, "<xsl:variable name=\"%s__context\" select=\".\"/>\n", c->name);
parsley_parsing_context = c;
char * str = inner_key_of(c->json);
if(str != NULL) {
// Case i: iterate over the nodes selected by that expression. When no
// string expression exists, nothing is written between the tags.
tmp = myparse(astrdup(str));
sprintbuf(c->buf, "<parsley:groups optional=\"true\"><xsl:for-each select=\"%s\">\n", filter_intersection(context->magic, tmp));
// Push this grouping expression onto the context's list of keys.
keys = parsley_alloc(sizeof(key_node));
keys->name = c->name;
keys->use = full_expr(c, tmp);
keys->next = c->keys;
c->keys = keys;
// Build the index expression: a concat() of one count(...) term per key
// in the list. The loop leaves `keys` NULL; c->keys still holds the head.
buf = printbuf_new();
sprintbuf(buf, "concat(");
while(keys != NULL){
sprintbuf(buf, "count(set:intersection(following::*, %s)), '-',", keys->use);
keys = keys->next;
}
sprintbuf(buf, "'')");
tmp = astrdup(buf->buf);
printbuf_free(buf);
// Declare the xsl:key in the shared key buffer (written out by the caller
// after the template) and bind the index value for the current node.
sprintbuf(c->key_buf, "<xsl:key name=\"%s__key\" match=\"%s\" use=\"%s\"/>\n", c->name,
full_expr(c, "./descendant-or-self::*"),
tmp
);
sprintbuf(c->buf, "<xsl:variable name=\"%s__index\" select=\"%s\"/>\n", c->name, tmp);
// Evaluate the nested members relative to the remembered context node.
sprintbuf(c->buf, "<xsl:for-each select=\"$%s__context\"><parsley:group optional=\"true\">\n", c->name);
__parsley_recurse(c);
sprintbuf(c->buf, "</parsley:group></xsl:for-each></xsl:for-each></parsley:groups>\n");
}
}
}
} else {
// Case j: non-array, non-string value.
if(c->filter == NULL) {
__parsley_recurse(c);
} else {
// Case k: evaluate the nested members on the first node matched by the filter.
sprintbuf(c->buf, "<xsl:for-each select=\"%s\"><xsl:if test=\"position() = 1\">\n", c->filter);
__parsley_recurse(c);
sprintbuf(c->buf, "</xsl:if></xsl:for-each>\n");
}
}
}
sprintbuf(c->buf, "</%s>\n", c->tag);
}
}
/*
 * Resolves `expr` against the context's full expression.
 *
 * A NULL `expr` yields the context's full expression unchanged. Otherwise
 * "." in `expr` is substituted with the context's full expression and "///"
 * in the result is replaced by "//" (both through arepl, defined elsewhere).
 */
static char* full_expr(contextPtr context, char* expr) {
  char* base = context->full_expr;
  if(expr == NULL) return base;
  return arepl(arepl(expr, ".", base), "///", "//");
}
/*
 * Finds the first string expression inside a JSON value.
 *
 * A string yields its own text and an object is searched member by member
 * via inner_key_each(). Arrays, NULL and every other type (numbers,
 * booleans, null) yield NULL.
 */
static char* inner_key_of(struct json_object * json) {
  if(json == NULL) return NULL;
  switch(json_object_get_type(json)) {
    case json_type_string:
      return (char*) json_object_get_string(json);
    case json_type_object:
      return inner_key_each(json);
    case json_type_array:
    default:
      /* Without a default, the remaining types used to fall off the end of
       * a non-void function. */
      return NULL;
  }
}
/*
 * Walks the members of a JSON object in iteration order and returns the
 * first non-NULL result of inner_key_of() on a member value, or NULL when no
 * member produces one.
 */
static char* inner_key_each(struct json_object * json) {
  char* found = NULL;
  json_object_object_foreach(json, key, val) {
    found = inner_key_of(val);
    if(found != NULL) break;
  }
  return found;
}