I want to tokenize my own extension of SQL syntax. This involves recognizing an escaped double quote inside a double-quoted string. E.g. in MySQL these two string tokens are equivalent: """" (the second double quote acts as an escape character) and '"'. I have tried different things, but I am stuck at how to replace a token's value.

#include <boost/spirit/include/lex_lexertl.hpp>

namespace lex = boost::spirit::lex;

// Question sketch: tokenizes a double-quoted string by switching into a
// dedicated lexer state on the opening quote and back on the closing quote.
// The inline comments carry the open questions of the original post.
template <typename Lexer>
struct sql_tokens : lex::lexer<Lexer>
{
  sql_tokens()
  {
    string_quote_double = "\\\"";    // '"'

    // "INITIAL" is the name of Spirit.Lex's default start state; rules
    // registered under any other name are not active when lexing begins.
    this->self("INITIAL")
      = string_quote_double [ lex::_state = "string_double" ] // how to also ignore + ctx.more()?
      // | ... (further token definitions elided in the question)
      ;

    this->self("string_double")
      = lex::token_def<>("[^\\\"]*") // action: ignore + ctx.more()
      | lex::token_def<>("\\\"\\\"") // how to set the token value to '"' ?
      | lex::token_def<>("\\\"") [ lex::_state = "INITIAL" ]
      ;
  }

  lex::token_def<> string_quote_double; // , ... (further token definitions elided in the question)
};

So how do I set the token's value to " when "" has been found?

Apart from that, I have the following question: I can write a functor for a semantic action that calls ctx.more() and ignores the token at the same time (thus combining "low level" tokens into a "high level" string token). But how do I elegantly combine this with lex::_state = ".."?

Edited in response to a comment; see "Update" below.

I suggest not trying to solve this in the lexer. Let the lexer yield raw strings:

// Lexer that yields each double-quoted string literal as one raw token
// (delimiting quotes and doubled quotes still included) and skips
// single whitespace characters.
template <typename Lexer>
struct mylexer_t : lex::lexer<Lexer>
{
    mylexer_t()
    {
        // Regex: an opening quote, then any run of non-quote characters or
        // doubled quotes, then the closing quote.
        string_quote_double = "\\\"([^\"]|\\\"\\\")*\\\"";

        // "INITIAL" is the name of Spirit.Lex's default start state; rules
        // registered under any other name are not active when lexing begins.
        this->self("INITIAL")
            = string_quote_double
            | lex::token_def<>("[ \t\r\n]") [ lex::_pass = lex::pass_flags::pass_ignore ]
            ;
    }

    // Exposes the matched text as a std::string attribute.
    lex::token_def<std::string> string_quote_double;
};

Note that exposing the token attribute like that requires a modified token typedef:

// Token over raw `char const*` input whose attribute may be a char or a
// std::string (the latter is what token_def<std::string> needs).
using token_type = lex::lexertl::token<char const*, boost::mpl::vector<char, std::string> >;

// Lexer type built on the token type above.
using lexer_type = lex::lexertl::actor_lexer<token_type>;

Then postprocess in the parser:

// Grammar over the token stream: parses zero or more double-quoted string
// tokens into a std::vector<std::string>, running `undoublequote` on each
// token's attribute as a semantic action.
template <typename Iterator>
struct mygrammar_t
    : public qi::grammar<Iterator, std::vector<std::string>()>
{
    // `this` is a keyword and cannot name a typedef; `This` is the usable spelling.
    typedef mygrammar_t<Iterator> This;

    // @param tok  lexer definition providing the `string_quote_double` token.
    template <typename TokenDef>
        mygrammar_t(TokenDef const& tok) : mygrammar_t::base_type(start)
    {
        using namespace qi;

        // %= keeps automatic attribute propagation even though a semantic
        // action is attached; the action rewrites the attribute in place.
        string_quote_double %= tok.string_quote_double [ undoublequote ];
        start = *string_quote_double;

        // The Spirit debug macros are upper-case.
        BOOST_SPIRIT_DEBUG_NODES((start)(string_quote_double));
    }

  private:
    qi::rule<Iterator, std::vector<std::string>()> start;
    qi::rule<Iterator, std::string()> string_quote_double;
};

As you can see, undoublequote can be any Phoenix actor that satisfies the criteria for a Spirit semantic action. A brain-dead example implementation would be:

/// Decode a double-quoted string literal in place.
///
/// Strips the two delimiting double quotes and collapses every doubled
/// quote (`""`) inside the literal into a single `"`.
///
/// @param val  In: the raw literal including both delimiting quotes,
///             e.g. `"bla""blo"`. Out: the decoded content on success;
///             left unchanged on failure.
/// @return true if `val` was a well-formed literal, false otherwise
///         (missing delimiter, or a lone quote inside the literal).
static bool undoublequote(std::string& val)
{
    const std::string::size_type len = val.size();

    // Both delimiters must be present; the opening quote is a delimiter,
    // not the first half of an escape pair.
    if (len < 2 || val[0] != '"' || val[len - 1] != '"')
        return false;

    std::string decoded;
    decoded.reserve(len - 2);

    // Walk the content only: indices 1 .. len-2.
    for (std::string::size_type in = 1; in + 1 < len; ++in) {
        if (val[in] == '"') {
            // A quote inside the literal is legal only as the first half of
            // `""`, and the second half must not be the closing delimiter.
            if (in + 2 >= len || val[in + 1] != '"')
                return false;
            ++in; // skip the escaping quote, keep the escaped one
        }
        decoded += val[in];
    }

    val.swap(decoded);
    return true;
}

But I suggest you write a "proper" de-escaper (as I'm pretty sure MySQL will allow \t, \r, \u001e or even more archaic stuff as well).

I have some more complete samples in old answers here:

todo
links
Here's a search page with many related answers using Spirit.

update

In fact, as you indicated, it is fairly easy to integrate the attribute value normalization into the lexer itself:

// Lexer that yields each double-quoted string literal as one token whose
// value has already been normalized by `undoublequote`, and skips single
// whitespace characters.
template <typename Lexer>
struct mylexer_t : lex::lexer<Lexer>
{
    // Function object usable as a lexer semantic action: decodes the
    // matched text and stores it as the token value, or fails the match.
    struct undoublequote_lex_type {
        template <typename, typename, typename, typename> struct result { typedef void type; };

        // @param f, l  iterators delimiting the matched input
        // @param pass  set to pass_fail when the text cannot be decoded
        // @param id    token id (not used here)
        // @param ctx   lexer context; receives the decoded token value
        template <typename It, typename IdType, typename PassFlag, typename Ctx>
            void operator()(It& f, It& l, PassFlag& pass, IdType& id, Ctx& ctx) const {
                std::string raw(f, l);
                if (undoublequote(raw))
                    ctx.set_value(raw);
                else
                    pass = lex::pass_flags::pass_fail;
            }
    } undoublequote_lex;

    mylexer_t()
    {
        // Regex: an opening quote, then any run of non-quote characters or
        // doubled quotes, then the closing quote.
        string_quote_double = "\\\"([^\"]|\\\"\\\")*\\\"";

        // Named differently from the member above so that it no longer
        // shadows it.
        const static undoublequote_lex_type unquote_action;

        // "INITIAL" is the name of Spirit.Lex's default start state; rules
        // registered under any other name are not active when lexing begins.
        this->self("INITIAL")
            = string_quote_double [ unquote_action ]
            | lex::token_def<>("[ \t\r\n]") [ lex::_pass = lex::pass_flags::pass_ignore ]
            ;
    }

    // Exposes the (decoded) text as a std::string attribute.
    lex::token_def<std::string> string_quote_double;
};

This reuses the same undoublequote function shown above, but wraps it in a deferred callable object (or "polymorphic functor") undoublequote_lex_type that satisfies the criteria for a lexer semantic action.

Here is a working proof of concept:

//#include <boost/config/warning_disable.hpp> //#define boost_spirit_debug_print_some 80 //#define boost_spirit_debug // before including spirit #include <boost/spirit/include/lex_lexertl.hpp> #include <boost/spirit/include/qi.hpp> #include <fstream> #ifdef memory_mapped #   include <boost/iostreams/device/mapped_file.hpp> #endif //#include <boost/spirit/include/lex_generate_static_lexertl.hpp>  namespace /*anon*/ {     namespace phx=boost::phoenix;     namespace qi =boost::spirit::qi;     namespace lex=boost::spirit::lex;      template <typename lexer>         struct mylexer_t : lex::lexer<lexer>     {         mylexer_t()         {             string_quote_double = "\\\"([^\"]|\\\"\\\")*\\\"";              this->self("initial")                 = string_quote_double                 | lex::token_def<>("[ \t\r\n]") [ lex::_pass = lex::pass_flags::pass_ignore ]                 ;         }          lex::token_def<std::string> string_quote_double;     };      static bool undoublequote(std::string& val)     {         auto outidx = 0;         for(auto in = val.begin(); in!=val.end(); ++in) {             switch(*in) {                 case '"':                      if (++in == val.end()) { // eat escape                         // end of input reached                         val.resize(outidx); // resize effective chars                         return true;                     }                     // fall through                 default:                     val[outidx++] = *in; // append character             }         }          return false; // not ended double quote expected     }      template <typename iterator> struct mygrammar_t         : public qi::grammar<iterator, std::vector<std::string>()>     {         typedef mygrammar_t<iterator> this;          template <typename tokendef>             mygrammar_t(tokendef const& tok) : mygrammar_t::base_type(start)         {             using namespace qi;              string_quote_double %= tok.string_quote_double [ 
undoublequote ];             start = *string_quote_double;              boost_spirit_debug_nodes((start)(string_quote_double));         }        private:         qi::rule<iterator, std::vector<std::string>()> start;         qi::rule<iterator, std::string()> string_quote_double;     }; }  std::vector<std::string> do_test_parse(const std::string& v) {     char const *first = &v[0];     char const *last = first+v.size();      typedef lex::lexertl::token<char const*, boost::mpl::vector<char, std::string> > token_type;     typedef lex::lexertl::actor_lexer<token_type> lexer_type;      typedef mylexer_t<lexer_type>::iterator_type iterator_type;     const static mylexer_t<lexer_type> mylexer;     const static mygrammar_t<iterator_type> parser(mylexer);      auto iter = mylexer.begin(first, last);     auto end = mylexer.end();      std::vector<std::string> data;     bool r = qi::parse(iter, end, parser, data);      r = r && (iter == end);      if (!r)         std::cerr << "parsing (" << iter->state() << ") failed at: '" << std::string(first, last) << "'\n";      return data; }  int main(int argc, const char *argv[]) {     (auto&& s : do_test_parse( "\"bla\"\"blo\""))         std::cout << s << std::endl; }

Search This Blog

DIs

c++ - how to get rid of escape character in a token with spirit::lex? -

update

Comments

Post a Comment

Popular posts from this blog

php - cannot display multiple markers in google maps v3 from traceroute result -

css - Text drops down with smaller window -

php - Boolean search on database with 5 million rows, very slow -