C++ - How to get rid of the escape character in a token with spirit::lex?
I want to tokenize my own extension of SQL syntax. This involves recognizing an escaped double quote inside a double-quoted string. E.g. in MySQL these two string tokens are equivalent: """"
(the second double quote acts as an escape character) and '"'
. I have tried different things, but I am stuck at how to replace a token's value.
#include <boost/spirit/include/lex_lexertl.hpp>

namespace lex = boost::spirit::lex;

// Sketch from the question: a two-state lexer that switches into a dedicated
// state while inside a double-quoted string. The `...` markers are elisions
// made by the question's author, so this snippet is illustrative and not
// compilable as-is.
template <typename Lexer>
struct sql_tokens : lex::lexer<Lexer>
{
    sql_tokens()
    {
        string_quote_double = "\\\"";    // '"'

        // "INITIAL" is the name of Spirit.Lex's built-in start state; the
        // name is case-sensitive, so it must be spelled in upper case.
        this->self("INITIAL")
            = string_quote_double [ lex::_state = "string_double" ] // how to also ignore + ctx.more()?
            | ...
            ;

        this->self("string_double")
            = lex::token_def<>("[^\\\"]*")  // action: ignore + ctx.more()
            | lex::token_def<>("\\\"\\\"")  // how to set the token value to '"' ?
            | lex::token_def<>("\\\"") [ lex::_state = "INITIAL" ]  // closing quote: back to the start state
            ;
    }

    lex::token_def<> string_quote_double, ...;
};
So how do I set the token's value to "
when ""
has been found?
Apart from that, I also have the following question: I can write a functor for a semantic action that calls ctx.more() and ignores the token at the same time (thus combining "low level" tokens into a "high level" string token). But how do I elegantly combine this with lex::_state = ".." ?
Edited in response to a comment; see "Update" below.
I suggest not trying to solve this in the lexer. Let the lexer yield raw strings:
// Lexer that yields each double-quoted string as one raw token (delimiters
// and doubled quotes included) and skips single whitespace characters.
template <typename Lexer>
struct mylexer_t : lex::lexer<Lexer>
{
    mylexer_t()
    {
        // Regex: an opening quote, then any run of non-quote characters or
        // doubled quotes, then a closing quote.
        string_quote_double = "\\\"([^\"]|\\\"\\\")*\\\"";

        // "INITIAL" is the name of Spirit.Lex's built-in start state; the
        // name is case-sensitive, so it must be spelled in upper case.
        this->self("INITIAL")
            = string_quote_double
            | lex::token_def<>("[ \t\r\n]") [ lex::_pass = lex::pass_flags::pass_ignore ]
            ;
    }

    // Token definition exposing its matched text as a std::string attribute.
    lex::token_def<std::string> string_quote_double;
};
Note that exposing the token attribute like that requires a modified token typedef:
// Token type iterating over a raw character buffer; the mpl::vector lists the
// attribute types a token may carry (char and std::string).
using token_type = lex::lexertl::token<char const*, boost::mpl::vector<char, std::string> >;

// actor_lexer is the lexer flavour used here so that token definitions can
// carry semantic actions.
using lexer_type = lex::lexertl::actor_lexer<token_type>;
Then postprocess in the parser:
// Qi grammar over the lexer's token stream: parses zero or more double-quoted
// string tokens into a std::vector<std::string>, passing each token's
// attribute through `undoublequote`.
template <typename Iterator>
struct mygrammar_t
    : public qi::grammar<Iterator, std::vector<std::string>()>
{
    // Convenience alias for this instantiation (`this` is a keyword and
    // cannot be used as a typedef name).
    typedef mygrammar_t<Iterator> This;

    // tok: the lexer definition object providing `string_quote_double`.
    template <typename TokenDef>
    mygrammar_t(TokenDef const& tok)
        : mygrammar_t::base_type(start)
    {
        using namespace qi;

        // %= keeps automatic attribute propagation even though a semantic
        // action is attached.
        string_quote_double %= tok.string_quote_double [ undoublequote ];
        start = *string_quote_double;

        // Macro name is upper case; it names the rules for debug output.
        BOOST_SPIRIT_DEBUG_NODES((start)(string_quote_double));
    }

  private:
    qi::rule<Iterator, std::vector<std::string>()> start;
    qi::rule<Iterator, std::string()> string_quote_double;
};
As you can see, undoublequote
can be any Phoenix actor that satisfies the criteria for a Spirit semantic action. A brain-dead example implementation would be:
// Converts a raw double-quoted token, delimiters included, into its string
// value in place: the surrounding quotes are removed and every doubled quote
// ("") inside becomes a single quote (").
//
// val: in/out; the raw token on entry, the unescaped value on success.
// Returns true on success. Returns false, leaving `val` untouched, when the
// token is not well formed: it does not start and end with '"', or it
// contains an interior quote that is not doubled.
static bool undoublequote(std::string& val)
{
    const std::string::size_type len = val.size();
    if (len < 2 || val[0] != '"' || val[len - 1] != '"')
        return false; // missing opening or closing delimiter

    std::string unescaped;
    unescaped.reserve(len - 2);

    // Walk only the interior characters, i.e. indices 1 .. len-2.
    for (std::string::size_type in = 1; in + 1 < len; ++in) {
        if (val[in] == '"') {
            // An interior quote must be followed by a second interior quote;
            // the final character is the closing delimiter and does not count.
            if (in + 2 >= len || val[in + 1] != '"')
                return false;
            ++in; // eat the escape, keep the quote it protects
        }
        unescaped += val[in];
    }

    // Commit only after the whole token validated.
    val.swap(unescaped);
    return true;
}
But I suggest you write a "proper" de-escaper (as I'm pretty sure MySQL will allow \t
, \r
, \u001e
or even more archaic stuff as well).
I have some more complete samples in old answers here:
- todo
- links
- Here's a search page with many related answers using Spirit
Update
In fact, as you indicated, it is fairly easy to integrate the attribute value normalization into the lexer itself:
// Lexer variant that normalizes the string token's value itself: a semantic
// action runs `undoublequote` on the matched text and stores the result as
// the token value.
template <typename Lexer>
struct mylexer_t : lex::lexer<Lexer>
{
    // Function object with the five-argument call signature used for lexer
    // semantic actions: (start, end, pass flag, token id, context).
    struct undoublequote_lex_type
    {
        // Result-type metafunction; the action returns nothing.
        template <typename, typename, typename, typename>
        struct result { typedef void type; };

        // f, l : range of the matched input
        // pass : set to pass_fail when the text cannot be unescaped
        // id   : token id (unused)
        // ctx  : lexer context; receives the unescaped token value
        // Template parameter names are capitalized so that the `ctx`
        // parameter does not shadow its own type parameter.
        template <typename It, typename IdType, typename pass_flag, typename Ctx>
        void operator()(It& f, It& l, pass_flag& pass, IdType& id, Ctx& ctx) const
        {
            std::string raw(f, l);
            if (undoublequote(raw))
                ctx.set_value(raw);
            else
                pass = lex::pass_flags::pass_fail;
        }
    } undoublequote_lex;

    mylexer_t()
    {
        string_quote_double = "\\\"([^\"]|\\\"\\\")*\\\"";

        // NOTE(review): this local static shadows the member of the same
        // name declared above; one of the two looks redundant. Kept as in
        // the original answer.
        const static undoublequote_lex_type undoublequote_lex;

        // "INITIAL" is the name of Spirit.Lex's built-in start state; the
        // name is case-sensitive, so it must be spelled in upper case.
        this->self("INITIAL")
            = string_quote_double [ undoublequote_lex ]
            | lex::token_def<>("[ \t\r\n]") [ lex::_pass = lex::pass_flags::pass_ignore ]
            ;
    }

    lex::token_def<std::string> string_quote_double;
};
This reuses the same undoublequote
function shown above, but wraps it in a deferred callable object (or "polymorphic functor") undoublequote_lex_type
that satisfies the criteria for a lexer semantic action.
Here is a fully working proof of concept:
//#include <boost/config/warning_disable.hpp> //#define boost_spirit_debug_print_some 80 //#define boost_spirit_debug // before including spirit #include <boost/spirit/include/lex_lexertl.hpp> #include <boost/spirit/include/qi.hpp> #include <fstream> #ifdef memory_mapped # include <boost/iostreams/device/mapped_file.hpp> #endif //#include <boost/spirit/include/lex_generate_static_lexertl.hpp> namespace /*anon*/ { namespace phx=boost::phoenix; namespace qi =boost::spirit::qi; namespace lex=boost::spirit::lex; template <typename lexer> struct mylexer_t : lex::lexer<lexer> { mylexer_t() { string_quote_double = "\\\"([^\"]|\\\"\\\")*\\\""; this->self("initial") = string_quote_double | lex::token_def<>("[ \t\r\n]") [ lex::_pass = lex::pass_flags::pass_ignore ] ; } lex::token_def<std::string> string_quote_double; }; static bool undoublequote(std::string& val) { auto outidx = 0; for(auto in = val.begin(); in!=val.end(); ++in) { switch(*in) { case '"': if (++in == val.end()) { // eat escape // end of input reached val.resize(outidx); // resize effective chars return true; } // fall through default: val[outidx++] = *in; // append character } } return false; // not ended double quote expected } template <typename iterator> struct mygrammar_t : public qi::grammar<iterator, std::vector<std::string>()> { typedef mygrammar_t<iterator> this; template <typename tokendef> mygrammar_t(tokendef const& tok) : mygrammar_t::base_type(start) { using namespace qi; string_quote_double %= tok.string_quote_double [ undoublequote ]; start = *string_quote_double; boost_spirit_debug_nodes((start)(string_quote_double)); } private: qi::rule<iterator, std::vector<std::string>()> start; qi::rule<iterator, std::string()> string_quote_double; }; } std::vector<std::string> do_test_parse(const std::string& v) { char const *first = &v[0]; char const *last = first+v.size(); typedef lex::lexertl::token<char const*, boost::mpl::vector<char, std::string> > token_type; typedef 
lex::lexertl::actor_lexer<token_type> lexer_type; typedef mylexer_t<lexer_type>::iterator_type iterator_type; const static mylexer_t<lexer_type> mylexer; const static mygrammar_t<iterator_type> parser(mylexer); auto iter = mylexer.begin(first, last); auto end = mylexer.end(); std::vector<std::string> data; bool r = qi::parse(iter, end, parser, data); r = r && (iter == end); if (!r) std::cerr << "parsing (" << iter->state() << ") failed at: '" << std::string(first, last) << "'\n"; return data; } int main(int argc, const char *argv[]) { (auto&& s : do_test_parse( "\"bla\"\"blo\"")) std::cout << s << std::endl; }
Comments
Post a Comment