/***************************************************************************
 *
 * scanner.cpp
 *
 * $Id: scanner.cpp 648752 2008-04-16 17:01:56Z faridz $
 *
 ***************************************************************************
 *
 * Licensed to the Apache Software  Foundation (ASF) under one or more
 * contributor  license agreements.  See  the NOTICE  file distributed
 * with  this  work  for  additional information  regarding  copyright
 * ownership.   The ASF  licenses this  file to  you under  the Apache
 * License, Version  2.0 (the  "License"); you may  not use  this file
 * except in  compliance with the License.   You may obtain  a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the  License is distributed on an  "AS IS" BASIS,
 * WITHOUT  WARRANTIES OR CONDITIONS  OF ANY  KIND, either  express or
 * implied.   See  the License  for  the  specific language  governing
 * permissions and limitations under the License.
 *
 * Copyright 2001-2006 Rogue Wave Software.
 *
 **************************************************************************/

#include "scanner.h"

#include "diagnostic.h"
#include "loc_exception.h"

#include <fstream>     // for std::ifstream
#include <stack>       // context_stack_ is used through the stack interface
#include <string>      // for std::string, std::getline()

#include <cassert>     // for assert()
#include <climits>     // for UCHAR_MAX, CHAR_BIT
#include <cstddef>     // for std::size_t
#include <cstdlib>     // for strtoul()
#include <cstring>     // for strcmp()


// Per-file scanner state: the open stream, the comment and escape
// characters in effect for that file, and the line currently being
// tokenized together with the read position within it.
struct ScannerContext
{
    ScannerContext (const char*, char = '#', char = '\\');

    std::ifstream file;        // file stream object
    std::string   filename;    // filename

    // comment and escape for current file
    char comment_char;
    char escape_char;

    // current line number for the scanner
    int line;

    // text of the current line (always '\n'-terminated once read)
    // and the position of the next unread character within it
    std::string line_;
    const char* pos_;

private:
    // not defined (not copy constructible or assignable)
    ScannerContext (const ScannerContext&);
    void operator= (ScannerContext&);
};

/**************************************************************************/
// helpers

// Rewrites every '/' and '\\' in s, in place, to the directory
// separator used by the build platform ('\\' for MSVC, '/' otherwise).
static void
normal_path (std::string& s)
{
    for (std::string::iterator it = s.begin (); it != s.end (); ++it) {
        if (*it == '/' || *it == '\\') {
#if defined (_MSC_VER)
            *it = '\\';
#else
            *it = '/';
#endif
        }
    }
}

/**************************************************************************/
// ScannerContext class definitions

// Opens the file `name' for reading and records the comment (cc) and
// escape (ec) characters to start with.  A failure to open the file is
// reported through issue_diag() as an error.
ScannerContext::
ScannerContext (const char* name, char cc, char ec)
    : file (name), filename (name),
      comment_char (cc), escape_char (ec),
      line (0) // , column (0)
{
    // update current position: the line buffer is empty, so the first
    // call to Scanner::next_token() will read a line
    pos_ = line_.c_str ();

    if (!file.is_open ())
        issue_diag (500, true, 0,
                    "%s could not be opened for reading\n", name);

    issue_diag (I_OPENRD, false, 0, "reading %s\n", name);
}

/**************************************************************************/
// Scanner class definitions

Scanner::
Scanner ()
    : context_ (0), nlines_ (0), ntokens_ (0), escaped_newline_ (false)
{
    // no-op
}


Scanner::
~Scanner()
{
    // empty the stack and destroy the current state
    delete context_;

    while (!context_stack_.empty ()) {
        delete context_stack_.top ();
        context_stack_.pop ();
    }
}


// Returns the escape character of the file being scanned,
// or 0 when no file is open.
char Scanner::
escape_char () const
{
    return context_ ? context_->escape_char : 0;
}


// Discards tokens up to and including the next newline token.
void Scanner::
ignore_line ()
{
    while (next_token ().token != tok_nl)
        ;
}


// Starts scanning the file `name' (after normalizing its directory
// separators) with the comment character cc and the escape character
// ec.  A file that is already being scanned is pushed on the context
// stack and resumed by the matching close().
void Scanner::
open (std::string name, char cc, char ec)
{
    normal_path (name);

    if (context_)
        context_stack_.push (context_);

    try {
        context_ = new ScannerContext (name.c_str (), cc, ec);
    }
    catch (...) {
        // undo the push above regardless of the exception type so that
        // the previous context is never reachable both through context_
        // and through the stack (the destructor would delete it twice)
        context_ = 0;

        if (!context_stack_.empty ()) {
            context_ = context_stack_.top ();
            context_stack_.pop ();
        }

        throw;
    }

    nlines_  = 0;
    ntokens_ = 0;
}


// Finishes scanning the current file, reports the token and line
// counts, and resumes the file (if any) that was being scanned when
// the current one was opened.
void Scanner::
close ()
{
    assert (0 != context_);

    issue_diag (I_OPENRD, false, 0, "%s: %u tokens, %u lines\n",
                context_->filename.c_str (), ntokens_, nlines_);

    delete context_;

    if (context_stack_.empty ())
        context_ = 0;
    else {
        context_ = context_stack_.top ();
        context_stack_.pop ();
    }
}


// Classifies the text of a token.  A name starting with the current
// escape character is tok_char_value when followed by an octal digit,
// 'd', or 'x', and tok_ndef otherwise.  Any other name is looked up
// in the table of keywords; tok_ndef is returned when it is not found.
Scanner::token_id Scanner::
process_token (const char* name)
{
    assert (0 != name);

    if (*name == context_->escape_char) {

        switch (name [1]) {
        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
        case 'd': case 'x':
            // escaped numeric character value
            return tok_char_value;

        default:
            break;
        }

        return tok_ndef;
    }

    // look for a predefined token
    static const struct {
        const char*       name;
        Scanner::token_id token;
    } tok_map [] = {
        // elements must be sorted in ascending order
        { "CHARMAP", tok_charmap },
        { "END", tok_end },
        { "IGNORE", tok_ignore },
        { "LC_ADDRESS", tok_addr },
        { "LC_COLLATE", tok_collate },
        { "LC_CTYPE", tok_ctype },
        { "LC_IDENTIFICATION", tok_ident },
        { "LC_MEASUREMENT", tok_measure },
        { "LC_MESSAGES", tok_messages },
        { "LC_MONETARY", tok_monetary },
        { "LC_NAME", tok_name },
        { "LC_NUMERIC", tok_numeric },
        { "LC_PAPER", tok_paper },
        { "LC_TELEPHONE", tok_phone },
        { "LC_TIME", tok_time },
        { "UNDEFINED", tok_undefined },
        { "WIDTH", tok_width },
        { "abday", tok_abday },
        { "abmon", tok_abmon },
        { "alpha", tok_alpha },
        { "alt_digits", tok_alt_digits },
        { "am_pm", tok_am_pm },
        { "backward", tok_backward },
        { "blank", tok_blank },
        { "cntrl", tok_cntrl },
        { "collating-element", tok_coll_elem },
        { "collating-symbol", tok_coll_sym },
        { "comment_char", tok_comment_char },
        { "copy", tok_copy },
        { "currency_symbol", tok_currency_symbol },
        { "d_fmt", tok_d_fmt },
        { "d_t_fmt", tok_d_t_fmt },
        { "day", tok_day },
        { "decimal_point", tok_decimal_point },
        { "digit", tok_digit },
        { "era", tok_era },
        { "era_d_fmt", tok_era_d_fmt },
        { "era_d_t_fmt", tok_era_d_t_fmt },
        { "era_t_fmt", tok_era_t_fmt },
        { "escape_char", tok_escape_char },
        { "falsename", tok_falsename },
        { "forward", tok_forward },
        { "frac_digits", tok_frac_digits },
        { "from", tok_from },
        { "graph", tok_graph },
        { "grouping", tok_grouping },
        { "include", tok_include },
        { "int_curr_symbol", tok_int_curr_symbol },
        { "int_frac_digits", tok_int_frac_digits },
        { "int_n_cs_precedes", tok_int_n_cs_precedes },
        { "int_n_sep_by_space", tok_int_n_sep_by_space },
        { "int_n_sign_posn", tok_int_n_sign_posn },
        { "int_p_cs_precedes", tok_int_p_cs_precedes },
        { "int_p_sep_by_space", tok_int_p_sep_by_space },
        { "int_p_sign_posn", tok_int_p_sign_posn },
        { "lower", tok_lower },
        { "mon", tok_mon },
        { "mon_decimal_point", tok_mon_decimal_point },
        { "mon_grouping", tok_mon_grouping },
        { "mon_thousands_sep", tok_mon_thousands_sep },
        { "n_cs_precedes", tok_n_cs_precedes },
        { "n_sep_by_space", tok_n_sep_by_space },
        { "n_sign_posn", tok_n_sign_posn },
        { "negative_sign", tok_negative_sign },
        { "noexpr", tok_noexpr },
        { "order_end", tok_order_end },
        { "order_start", tok_order_start },
        { "p_cs_precedes", tok_p_cs_precedes },
        { "p_sep_by_space", tok_p_sep_by_space },
        { "p_sign_posn", tok_p_sign_posn },
        { "position", tok_position },
        { "positive_sign", tok_positive_sign },
        { "print", tok_print },
        { "punct", tok_punct },
        { "reorder-after", tok_reorder },
        { "reorder-end", tok_reorder_end },
        { "reorder-section-after", tok_reorder_section },
        { "reorder-section-end", tok_reorder_section_end },
        { "script", tok_script },
        { "space", tok_space },
        { "t_fmt", tok_t_fmt },
        { "t_fmt_ampm", tok_t_fmt_ampm },
        { "thousands_sep", tok_thousands_sep },
        { "tolower", tok_tolower },
        { "toupper", tok_toupper },
        { "translit_end", tok_xlit_end },
        { "translit_start", tok_xlit_start },
        { "truename", tok_truename },
        { "upper", tok_upper },
        { "xdigit", tok_xdigit },
        { "yesexpr", tok_yesexpr }
    };

    int low  = 0;
    int high = sizeof tok_map / sizeof *tok_map - 1;

    // this loop implements a binary search to find 'name' in the
    // tok_map list and when found returns the token value.
    while (low <= high) {

        const int cur = (low + high) / 2;
        const int cmp = std::strcmp (name, tok_map [cur].name);

        if (0 == cmp)
            return tok_map [cur].token;

        if (cmp < 0)
            high = cur - 1;
        else
            low = cur + 1;
    }

    return tok_ndef;
}


// Replaces the current line with the next one read from the file.
// The stored line always ends in '\n' (std::getline() strips the
// delimiter, and a failed read leaves just the appended '\n'), which
// the tokenizer relies on as its end-of-line sentinel.
void Scanner::
read_line ()
{
    context_->line_.clear ();
    std::getline (context_->file, context_->line_);
    context_->line_ += '\n';
    context_->pos_   = context_->line_.c_str ();

    ++context_->line;
    // context_->column = 0;

    ++nlines_;

    assert (context_->line_.size ());
}


// Extracts and returns the next token from the current file.  Returns
// a token with the id tok_end_tokens once the file has been read to
// its end and the last line has been fully consumed.
//
// NOTE(review): the error paths below assume that issue_diag() called
// with a true second argument does not return normally (open() expects
// a loc_exception) -- confirm against diagnostic.h.
Scanner::token_t Scanner::
next_token ()
{
    assert (0 != context_);
    assert (context_->file.is_open ());

    // token
    token_t next_tok;

    next_tok.name   = "";
    next_tok.token  = tok_ndef;
    next_tok.line   = 0;
    next_tok.column = 0;
    next_tok.file   = 0;

    while (true) {

        // store the *current* file name
        next_tok.file = context_->filename.c_str ();

        // if we exhausted the current line, advance
        if (   context_->line_.size ()
            <= std::size_t (context_->pos_ - context_->line_.c_str ())) {

            // end of input is only reported once the buffered line has
            // been consumed: std::getline() sets eofbit while reading
            // a last line that has no trailing newline, and the tokens
            // on that line must still be returned
            if (context_->file.eof ()) {
                next_tok.token = tok_end_tokens;
                return next_tok;
            }

            read_line ();
        }

        // line and column for the token start; they are set at each
        // iteration; the finding of a token breaks and next_tok leaves
        // this loop having the line/col info
        next_tok.line   = context_->line;
        next_tok.column = context_->pos_ - context_->line_.c_str ();

        // plug in the pointer to current position; being a reference,
        // it tracks any change of pos_ made by read_line()
        const char*& next = context_->pos_;

        if (*next != context_->comment_char)
            escaped_newline_ = false;

        if (*next == '<') {

            // beginning of a symbolic name or keyword
            const char* tok_begin = next++;

            for (; '>' != *next; ++next) {
                // if has an escaped close angular, pass
                if (*next == context_->escape_char) {
                    // append symbol name up to but not including the escape
                    next_tok.name.append (tok_begin, next - tok_begin);

                    // advance the next pointer to skip the escape
                    tok_begin = ++next;
                }
                else if ('\n' == *next) {
                    // past the end of the line
                    issue_diag (E_SYNTAX, true, &next_tok,
                                " unterminated symbolic name\n");
                    break;
                }
            }

            // append the rest of the name including the closing '>'
            next_tok.name.append (tok_begin, ++next - tok_begin);

            // check the name fetched so far against the keywords
            // that are spelled like symbolic names
            if (next_tok.name == "<code_set_name>") {
                next_tok.token = tok_code_set_name;
            }
            else if (   next_tok.name == "<escape_char>"
                     || next_tok.name == "<comment_char>") {

                // eat away spaces
                while (' ' == *next || '\t' == *next) {
                    ++next;
                }

                // test for end of line
                if (*next == '\n')
                    issue_diag (E_SYNTAX, true, &next_tok,
                                "missing value for %s\n",
                                next_tok.name.c_str ());

                // store character
                if (next_tok.name == "<escape_char>")
                    context_->escape_char = *next;
                else
                    context_->comment_char = *next;

                // adjust positions; the rest of the line is ignored
                context_->pos_ =
                    context_->line_.c_str () + context_->line_.size ();

                // set token to a newline
                next_tok.name  = "";
                next_tok.token = tok_nl;
            }
            else if (next_tok.name == "<mb_cur_max>") {
                next_tok.token = tok_mb_cur_max;
            }
            else if (next_tok.name == "<mb_cur_min>") {
                next_tok.token = tok_mb_cur_min;
            }
            else {
                next_tok.token = tok_sym_name;
            }

            break;
        }
        else if (*next == ' ' || *next == '\t' || *next == ';') {

            // ignore whitespace and separators
            while (*next == ' ' || *next == '\t' || *next == ';') {
                ++next;
            }
        }
        else if (*next == '\n') {

            ++next;
            next_tok.token = tok_nl;

            break;
        }
        else if (*next == context_->comment_char) {

            // start of a comment - check as early as necessary
            // adjust to end of line
            context_->pos_ =
                context_->line_.c_str () + context_->line_.size ();

            // a comment line following an escaped newline does not
            // terminate the continued line
            if (escaped_newline_)
                continue;

            next_tok.token = tok_nl;
            next_tok.name  = "\n";

            break;
        }
        else if (*next == '(') {

            // push open parenthesis
            next_tok.name.push_back (*next++);

            // start of a grouping
            while (*next != ')') {

                // contains a symbolic name
                if (*next == '<') {

                    // push open angular parenthesis
                    next_tok.name.push_back (*next++);

                    while (*next != '\n') {

                        // if has an escaped close angular, pass
                        if (next [0] == context_->escape_char) {
                            next_tok.name.push_back (*next++);
                            next_tok.name.push_back (*next++);
                            continue;
                        }

                        // if we have reached the end of the sym name
                        if (*next == '>') {
                            next_tok.name.push_back (*next);
                            break;
                        }

                        // still inside the sym name/keyword
                        next_tok.name.push_back (*next++);
                    }

                    // check if we have gone past the end of the line
                    if (*next == '\n')
                        issue_diag (E_SYNTAX, true, &next_tok,
                                    " unterminated symbolic name");

                    ++next;
                }
                else {
                    // fetch the character
                    next_tok.name.push_back (*next++);
                }

                if (*next == '\n')
                    issue_diag (E_SYNTAX, true, &next_tok,
                                " unterminated grouping ");
            }

            // push close parenthesis
            next_tok.name.push_back (*next++);
            next_tok.token = tok_grouping;

            break;
        }
        else if (*next == '.') {

            // ellipsis (see ISO/IEC TR 14652)
            int ellipsis_count = 0;

            // start of an interval
            while (*next == '.') {
                next_tok.name.push_back (*next++);
                ++ellipsis_count;
            }

            switch (ellipsis_count) {
            case 2: {
                const char* tmp = next;

                if (   *tmp++ == '(' && *tmp++ == '2' && *tmp++ == ')'
                    && *tmp++ == '.' && *tmp++ == '.') {
                    // double increment hexadecimal symbolic ellipsis
                    next_tok.token = tok_dbl_ellipsis;
                    next           = tmp;
                }
                else {
                    // hexadecimal symbolic ellipsis
                    next_tok.token = tok_hex_ellipsis;
                }
                break;
            }

            case 3:
                // absolute symbolic ellipsis
                next_tok.token = tok_abs_ellipsis;
                break;

            case 4:
                // decimal symbolic ellipsis
                next_tok.token = tok_dec_ellipsis;
                break;

            default:
                issue_diag (E_SYNTAX, true, &next_tok,
                            "illegal ellipsis\n");
            }

            break;
        }
        else if (*next == '\"') {

            // start of a string
            next_tok.name.push_back (*next++);

            const char ec = context_->escape_char;

            while (next[0] != '\n') {

                // escaped newline; continue on the next line
                if (next [0] == ec && next [1] == '\n') {
                    read_line ();
                    continue;
                }

                // escaped quote (or any other escaped character)
                if (next[0] == ec) {
                    next_tok.name.push_back (*next++);
                    next_tok.name.push_back (*next++);
                    continue;
                }

                if (next [0] == '\"') {
                    next_tok.name.push_back (*next);
                    break;
                }

                // still inside the string
                next_tok.name.push_back (*next++);
            }

            // test for closure
            if (*next == '\n')
                issue_diag (E_SYNTAX, true, &next_tok,
                            "unterminated string");

            ++next;
            next_tok.token = tok_string;

            break;
        }
        else if (*next == context_->escape_char) {

            // start of an escape sequence

            // escaped new line
            if (next [1] == '\n') {
                // adjust to end of line
                context_->pos_ =
                    context_->line_.c_str () + context_->line_.size ();

                escaped_newline_ = true;
                continue;
            }

            // or an escaped value running up to the next separator
            while (   *next != ' ' && *next != '\t'
                   && *next != ';' && *next != '\n') {
                next_tok.name.push_back (*next++);
            }

            // retrieve token based on value
            next_tok.token = process_token (next_tok.name.c_str ());

            break;
        }
        else {

            // the rest of it
            for (const char ec = context_->escape_char; ; ) {

                // stop at esc-newline or at first "separator"
                if (   (next [0] == ec && next [1] == '\n')
                    || next [0] == ' '  || next [0] == '\t'
                    || next [0] == '\n' || next [0] == ';') {
                    // continuation of a line, separators
                    break;
                }

                // fetch characters
                next_tok.name.push_back (*next++);
            }

            // assert length of input
            assert (next_tok.name.size ());

            // it wasn't a locale definition keyword so call process_token
            // and add the result to the list
            next_tok.token = process_token (next_tok.name.c_str ());

            // an extra bit of processing since we keep comment and escape
            // characters in the scanner for a faster processing
            if (   next_tok.token == tok_escape_char
                || next_tok.token == tok_comment_char) {

                // eat away spaces
                while (' ' == *next || '\t' == *next) {
                    ++next;
                }

                // test for end of line
                if (*next == '\n')
                    issue_diag (E_SYNTAX, true, &next_tok,
                                "unterminated statement");

                // store character
                if (next_tok.token == tok_escape_char)
                    context_->escape_char = next [0];
                else
                    context_->comment_char = next [0];

                // adjust positions; the rest of the line is ignored
                context_->pos_ =
                    context_->line_.c_str () + context_->line_.size ();

                // return the token
                next_tok.name  = "";
                next_tok.token = tok_nl;
            }

            break;
        }
    }

    ++ntokens_;

    return next_tok;
}


// Converts the escape sequence starting at esc to its numeric value.
//
// Each escaped byte consists of the escape character followed by an
// octal (optional 'o' prefix), decimal ('d' prefix), or hexadecimal
// ('x' prefix) constant no greater than UCHAR_MAX.  When multi is
// true, consecutive escaped bytes are combined into a single value
// with the first byte in the most significant position.
//
//   esc    the sequence to convert; must start with the escape char
//   pend   when non-null, *pend is set to point just past the last
//          constant converted; when null and multi is false, any
//          character following the constant is diagnosed as an error
//   multi  whether to accept a sequence of more than one escaped byte
//
// Errors are reported through issue_diag().
unsigned long Scanner::
convert_escape (const char *esc,
                const char **pend /* = 0 */,
                bool multi /* = false */) const
{
    assert (0 != esc);

    const char escape = escape_char ();

    if (escape != *esc)
        issue_diag (E_SYNTAX, true, 0,
                    "expected the escape character ('%c'), got \"%s\"\n",
                    escape, esc);

    unsigned long value = 0;

    for (const char *s = esc; ; ) {

        // escaped characters are octal by default
        const char *basename = "octal";
        int         base     = 8;

        // s points at the escape character; look at what follows it
        switch (*++s) {
        case 'd': ++s; base = 10; basename = "decimal"; break;
        case 'x': ++s; base = 16; basename = "hexadecimal"; break;
        case 'o': ++s;
            // fall through
        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
            break;

        default:
            issue_diag (E_SYNTAX, true, 0,
                        "one of { 'o', 'd', 'x' } expected following "
                        "the escape character: %s\n", esc);
        }

        char *end = 0;

        const unsigned long byte = std::strtoul (s, &end, base);

        if (pend)
            *pend = end;

        // all further checks use the local end pointer so that the
        // function is safe to call with the default (null) pend

        // a caller that doesn't ask where the conversion stopped
        // expects the whole string to be a single constant
        if (!multi && 0 == pend && *end)
            issue_diag (E_SYNTAX, true, 0,
                        "%s constant expected: %s\n", basename, esc);

        if (UCHAR_MAX < byte)
            issue_diag (E_INVAL, true, 0,
                        "%s byte value must be in the range [0, %d]: %s\n",
                        basename, int (UCHAR_MAX), esc);

        // the most significant byte must be clear before shifting
        if (value >> (sizeof (unsigned long) - 1) * CHAR_BIT)
            issue_diag (E_INVAL, true, 0, "integer overflow: %s\n", esc);

        value = (value << CHAR_BIT) | byte;

        // continue only with another escaped byte of a multibyte value
        if (!multi || *end != escape)
            break;

        s = end;
    }

    return value;
}