/***************************************************************************
 *
 * scanner.cpp
 *
 * $Id: scanner.cpp 648752 2008-04-16 17:01:56Z faridz $
 *
 ***************************************************************************
 *
 * Licensed to the Apache Software  Foundation (ASF) under one or more
 * contributor  license agreements.  See  the NOTICE  file distributed
 * with  this  work  for  additional information  regarding  copyright
 * ownership.   The ASF  licenses this  file to  you under  the Apache
 * License, Version  2.0 (the  "License"); you may  not use  this file
 * except in  compliance with the License.   You may obtain  a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the  License is distributed on an  "AS IS" BASIS,
 * WITHOUT  WARRANTIES OR CONDITIONS  OF ANY  KIND, either  express or
 * implied.   See  the License  for  the  specific language  governing
 * permissions and limitations under the License.
 *
 * Copyright 2001-2006 Rogue Wave Software.
 * 
 **************************************************************************/

#include "scanner.h"

#include "diagnostic.h"
#include "loc_exception.h"

#include <fstream>
#include <string>
#include <vector>

#include <cassert>   // for assert()
#include <climits>   // for UCHAR_MAX
#include <cstdlib>   // for strtol()
#include <cstring>   // for strcmp()


// Per-file scanning state: the input stream being read together with
// the comment/escape characters in effect for that file and the
// current read position.
struct ScannerContext
{
    // opens the named file for reading; the two char arguments are the
    // initial comment and escape characters, in that order
    ScannerContext (const char*, char = '#', char = '\\');
 
    std::ifstream file;            // file stream object
    std::string   filename;        // filename

    // comment and escape for current file
    char comment_char;
    char escape_char;

    // number of the current line; starts at 0 and is incremented by
    // the scanner every time it reads another line
    int line;

    // text of the current line and the read position within it
    std::string line_;
    const char* pos_;

private:
    // not defined (not copy constructible or assignable)
    ScannerContext (const ScannerContext&);
    void operator= (ScannerContext&);
};

/**************************************************************************/
// helpers

// Rewrites, in place, every path separator found in `s` (either '/'
// or '\\') to the one native to the compiler in use: a backslash when
// built with MSVC and a forward slash otherwise.
static void normal_path (std::string& s)
{
#if defined (_MSC_VER)
    const char native_sep = '\\';
#else
    const char native_sep = '/';
#endif

    for (std::string::size_type i = 0; i != s.size (); ++i) {
        char& ch = s [i];

        if ('/' == ch || '\\' == ch)
            ch = native_sep;
    }
}

/**************************************************************************/
// ScannerContext class definitions

// Opens `name` for reading and records the comment (`cc`) and escape
// (`ec`) characters to use for it.  The line counter starts at zero
// and the read position refers to the (still empty) line buffer.
// Diagnostic 500 is issued as an error when the file cannot be opened.
ScannerContext::
ScannerContext (const char* name, char cc, char ec)
    : file (name),
      filename (name),
      comment_char (cc),
      escape_char (ec),
      line (0),
      line_ (),
      pos_ (line_.c_str ())
{
    const bool opened = file.is_open ();

    if (!opened)
        issue_diag (500, true, 0, 
                    "%s could not be opened for reading\n", name);

    issue_diag (I_OPENRD, false, 0, "reading %s\n", name);
}

/**************************************************************************/
// Scanner class definitions

// Creates a scanner that has no file open: there is no current
// context, both counters are zero and no escaped newline is pending.
Scanner::
Scanner ()
    : context_ (0),
      nlines_ (0),
      ntokens_ (0),
      escaped_newline_ (false)
{
}


// Destroys the current context (if any) followed by every context
// still waiting on the stack.
Scanner::
~Scanner() 
{
    delete context_;

    for (; !context_stack_.empty (); context_stack_.pop ())
        delete context_stack_.top ();
}


// Returns the escape character of the file currently being scanned,
// or 0 when no file is open.
char Scanner::
escape_char () const
{
    if (0 == context_)
        return 0;

    return context_->escape_char;
}

void Scanner::
ignore_line ()
{
    while (next_token ().token != tok_nl);
}


void Scanner::
open (std::string name, char cc, char ec)
{
    normal_path (name);

    if (context_)
        context_stack_.push (context_);

    try {
        context_ = new ScannerContext (name.c_str (), cc, ec);
    }
    catch (loc_exception&) {
        context_ = 0;

        if (!context_stack_.empty ()) {
            context_ = context_stack_.top ();
            context_stack_.pop ();
        }

        throw;
    }

    nlines_ = 0;
    ntokens_ = 0;
}


// Finishes the file currently being scanned: reports its token and
// line counts, destroys its context and resumes the most recently
// suspended file, if there is one.
void Scanner::
close ()
{
    assert (0 != context_);

    issue_diag (I_OPENRD, false, 0,
                "%s: %u tokens, %u lines\n",
                context_->filename.c_str (), ntokens_, nlines_);

    delete context_;
    context_ = 0;

    // go back to the enclosing file, if any
    if (!context_stack_.empty ()) {
        context_ = context_stack_.top ();
        context_stack_.pop ();
    }
}


// Classifies the text of a token.
//
// Text starting with the current escape character is a numeric
// character value (tok_char_value) when the escape is followed by an
// octal digit, 'd' or 'x', and tok_ndef otherwise.  Any other text is
// looked up in the table of predefined keywords; tok_ndef is returned
// when it is not one of them.
Scanner::token_id Scanner::
process_token (const char* name)
{
    assert (0 != name);

    if (context_->escape_char == *name) {
        // the character after the escape selects the notation
        const char kind = name [1];

        const bool numeric =    ('0' <= kind && kind <= '7')
                             || 'd' == kind
                             || 'x' == kind;

        return numeric ? tok_char_value : tok_ndef;
    }

    // predefined keywords; the binary search below relies on the
    // entries being sorted in ascending strcmp() order
    static const struct {
        const char*       keyword;
        Scanner::token_id id;
    } keywords [] = {
        { "CHARMAP", tok_charmap },
        { "END", tok_end },
        { "IGNORE", tok_ignore },
        { "LC_ADDRESS", tok_addr },
        { "LC_COLLATE", tok_collate },
        { "LC_CTYPE", tok_ctype },
        { "LC_IDENTIFICATION", tok_ident },
        { "LC_MEASUREMENT", tok_measure },
        { "LC_MESSAGES", tok_messages },
        { "LC_MONETARY", tok_monetary },
        { "LC_NAME", tok_name },
        { "LC_NUMERIC", tok_numeric },
        { "LC_PAPER", tok_paper },
        { "LC_TELEPHONE", tok_phone },
        { "LC_TIME", tok_time },
        { "UNDEFINED", tok_undefined },
        { "WIDTH", tok_width },
        { "abday", tok_abday },
        { "abmon", tok_abmon },
        { "alpha", tok_alpha },
        { "alt_digits", tok_alt_digits },
        { "am_pm", tok_am_pm },
        { "backward", tok_backward },
        { "blank", tok_blank },
        { "cntrl", tok_cntrl },
        { "collating-element", tok_coll_elem },
        { "collating-symbol", tok_coll_sym },
        { "comment_char", tok_comment_char },
        { "copy", tok_copy },
        { "currency_symbol", tok_currency_symbol },
        { "d_fmt", tok_d_fmt },
        { "d_t_fmt", tok_d_t_fmt },
        { "day", tok_day },
        { "decimal_point", tok_decimal_point },
        { "digit", tok_digit },
        { "era", tok_era },
        { "era_d_fmt", tok_era_d_fmt },
        { "era_d_t_fmt", tok_era_d_t_fmt },
        { "era_t_fmt", tok_era_t_fmt },
        { "escape_char", tok_escape_char },
        { "falsename", tok_falsename },
        { "forward", tok_forward },
        { "frac_digits", tok_frac_digits },
        { "from", tok_from },
        { "graph", tok_graph },
        { "grouping", tok_grouping },
        { "include", tok_include },
        { "int_curr_symbol", tok_int_curr_symbol },
        { "int_frac_digits", tok_int_frac_digits },
        { "int_n_cs_precedes", tok_int_n_cs_precedes },
        { "int_n_sep_by_space", tok_int_n_sep_by_space },
        { "int_n_sign_posn", tok_int_n_sign_posn },
        { "int_p_cs_precedes", tok_int_p_cs_precedes },
        { "int_p_sep_by_space", tok_int_p_sep_by_space },
        { "int_p_sign_posn", tok_int_p_sign_posn },
        { "lower", tok_lower },
        { "mon", tok_mon },
        { "mon_decimal_point", tok_mon_decimal_point },
        { "mon_grouping", tok_mon_grouping },
        { "mon_thousands_sep", tok_mon_thousands_sep },
        { "n_cs_precedes", tok_n_cs_precedes },
        { "n_sep_by_space", tok_n_sep_by_space },
        { "n_sign_posn", tok_n_sign_posn },
        { "negative_sign", tok_negative_sign },
        { "noexpr", tok_noexpr },
        { "order_end", tok_order_end },
        { "order_start", tok_order_start },
        { "p_cs_precedes", tok_p_cs_precedes },
        { "p_sep_by_space", tok_p_sep_by_space },
        { "p_sign_posn", tok_p_sign_posn },
        { "position", tok_position },
        { "positive_sign", tok_positive_sign },
        { "print", tok_print },
        { "punct", tok_punct },
        { "reorder-after", tok_reorder },
        { "reorder-end", tok_reorder_end },
        { "reorder-section-after", tok_reorder_section },
        { "reorder-section-end", tok_reorder_section_end },
        { "script", tok_script },
        { "space", tok_space },
        { "t_fmt", tok_t_fmt },
        { "t_fmt_ampm", tok_t_fmt_ampm },
        { "thousands_sep", tok_thousands_sep },
        { "tolower", tok_tolower },
        { "toupper", tok_toupper },
        { "translit_end", tok_xlit_end },
        { "translit_start", tok_xlit_start },
        { "truename", tok_truename },
        { "upper", tok_upper },
        { "xdigit", tok_xdigit },
        { "yesexpr", tok_yesexpr }
    };

    // binary search over the half-open range [first, last)
    std::size_t first = 0;
    std::size_t last  = sizeof keywords / sizeof *keywords;

    while (first < last) {

        const std::size_t mid = first + (last - first) / 2;

        const int order = std::strcmp (name, keywords [mid].keyword);

        if (order < 0)
            last = mid;
        else if (0 < order)
            first = mid + 1;
        else
            return keywords [mid].id;
    }

    // not a keyword
    return tok_ndef;
}


void Scanner::
read_line ()
{
    context_->line_.clear ();

    std::getline (context_->file, context_->line_);

    context_->line_ += '\n';
      
    context_->pos_ = context_->line_.c_str ();

    ++context_->line;
    // context_->column = 0;

    ++nlines_;
    
    assert (context_->line_.size ());
}


// Extracts and returns the next token from the file being scanned.
//
// The returned token carries its text (name), its kind (token), the
// name of the file it came from and, for every kind other than
// tok_end_tokens, the line and column at which it starts.
//
// Recognized, in order of testing the first character:
//   '<'            symbolic name, or one of the <code_set_name>,
//                  <mb_cur_max>, <mb_cur_min>, <escape_char> and
//                  <comment_char> directives (the last two update
//                  the context and are reported as a newline)
//   ' ' '\t' ';'   skipped
//   '\n'           tok_nl
//   comment char   rest of the line is dropped; reported as tok_nl
//                  unless it follows an escaped newline
//   '('            grouping, up to the matching ')'
//   '.'            one of the ellipsis tokens
//   '"'            string, possibly continued over escaped newlines
//   escape char    escaped newline (line continuation) or an escape
//                  sequence classified by process_token()
//   anything else  a word classified by process_token(); the
//                  escape_char and comment_char keywords update the
//                  context and are reported as a newline
//
// tok_end_tokens is returned once the file has been read to its end
// and everything buffered from it has been handed out.
Scanner::token_t Scanner::
next_token ()
{
    assert (0 != context_);
    assert (context_->file.is_open ());

    // token
    token_t next_tok;

    next_tok.name   = "";
    next_tok.token  = tok_ndef;
    next_tok.line   = 0;
    next_tok.column = 0;
    next_tok.file   = 0;

    while (true) {

        // store the *current* file name
        next_tok.file = context_->filename.c_str ();

        // if we exhausted the current line, advance
        if (   context_->line_.size ()
            <= std::size_t (context_->pos_ - context_->line_.c_str ())) {

            // report the end of input only when there is nothing left
            // in the line buffer: reading a last line that has no
            // trailing newline already sets eofbit, and testing it
            // any earlier would drop the remaining tokens of that line
            if (context_->file.eof ()) {
                next_tok.token = tok_end_tokens;
                return next_tok;
            }

            read_line ();
        }

        // line and column for the token start; they are set at each
        // iteration; the finding of a token breaks and next_tok leaves
        // this loop having the line/col info
        next_tok.line   = context_->line;
        next_tok.column = context_->pos_ - context_->line_.c_str ();

        // plug in the pointer to current position; this is a reference
        // so advancing it advances the position kept in the context
        const char*& next = context_->pos_;

        if (*next != context_->comment_char)
            escaped_newline_ = false;

        if (*next == '<') {
            // beginning of a symbolic name or keyword
            const char* tok_begin = next++;

            for (; '>' != *next; ++next) {

                // if has an escaped close angular, pass
                if (*next == context_->escape_char) {

                    // append symbol name up to but not including the escape
                    next_tok.name.append (tok_begin, next - tok_begin);

                    // advance the next pointer to skip the escape
                    tok_begin = ++next;
                }

                // tested after the escape has been skipped as well so
                // that an escape character right before the newline
                // cannot carry the scan past the end of the line
                if ('\n' == *next) {
                    // past the end of the line
                    issue_diag (E_SYNTAX, true, &next_tok, 
                                " unterminated symbolic name\n");
                    break;
                }
            }

            next_tok.name.append (tok_begin, ++next - tok_begin);

            // check the name fetched so far
            if (next_tok.name == "<code_set_name>") {
                next_tok.token = tok_code_set_name;
            }
            else if (   next_tok.name == "<escape_char>"
                     || next_tok.name == "<comment_char>") {

                // eat away spaces
                while (' ' == *next || '\t' == *next) {
                    ++next;
                }

                // test for end of line
                if (*next == '\n')
                    issue_diag (E_SYNTAX, true, &next_tok, 
                                "missing value for %s\n",
                                next_tok.name.c_str ());

                // store character
                if (next_tok.name == "<escape_char>")
                    context_->escape_char = *next;
                else
                    context_->comment_char = *next;

                // adjust positions; the rest of the line is ignored
                context_->pos_ =
                    context_->line_.c_str () + context_->line_.size ();

                // set token to a newline
                next_tok.name = "";
                next_tok.token = tok_nl;
            }
            else if (next_tok.name == "<mb_cur_max>") {
                next_tok.token = tok_mb_cur_max;
            }
            else if (next_tok.name == "<mb_cur_min>") {
                next_tok.token = tok_mb_cur_min;
            }
            else {
                next_tok.token = tok_sym_name;
            }
            break;
        }
        else if (*next == ' ' || *next == '\t' || *next == ';') {
            // ignore whitespace and separators
            while (*next == ' ' || *next == '\t' || *next == ';') {
                ++next;
            }
        }
        else if (*next == '\n') {
            ++next;
            next_tok.token = tok_nl;
            break;
        }
        else if (*next == context_->comment_char) {
            // start of a comment - check as early as necessary
            // adjust to end of line
            context_->pos_ = context_->line_.c_str () + context_->line_.size ();

            // a comment line in the middle of a continued line does
            // not end the logical line
            if (escaped_newline_)
                continue;

            next_tok.token = tok_nl;
            next_tok.name = "\n";
            break;
        }
        else if (*next == '(') {
            // push open parenthesis
            next_tok.name.push_back (*next++);

            // start of a grouping
            while (*next != ')') {
                // contains a symbolic name
                if (*next == '<') {
                    // push open angular parenthesis
                    next_tok.name.push_back (*next++);

                    while (*next != '\n') {
                        // if has an escaped close angular, pass
                        if (next [0] == context_->escape_char) {
                            next_tok.name.push_back (*next++);

                            // an escape at the very end of the line
                            // has nothing to escape; leave the newline
                            // in place for the check below instead of
                            // stepping over it
                            if (*next == '\n')
                                break;

                            next_tok.name.push_back (*next++);
                            continue;
                        }

                        // if we have reached the end of the sym name
                        if (*next == '>') {
                            next_tok.name.push_back (*next);
                            break;
                        }

                        // still inside the sym name/keyword
                        next_tok.name.push_back (*next++);
                    }

                    // check if we have gone past the end of the line
                    if (*next == '\n')
                        issue_diag (E_SYNTAX, true, &next_tok, 
                                    " unterminated symbolic name");

                    ++next;
                }
                else {
                    // fetch the character
                    next_tok.name.push_back (*next++);
                }

                if (*next == '\n')
                    issue_diag (E_SYNTAX, true, &next_tok, 
                                " unterminated grouping ");
            }

            next_tok.name.push_back (*next++);
            next_tok.token = tok_grouping;
            break;
        } 
        else if (*next == '.') {
            // ellipsis (see ISO/IEC TR 14652)
            int ellipsis_count = 0;
            // start of an interval
            while (*next == '.') {
                next_tok.name.push_back (*next++);
                ++ellipsis_count;
            }

            switch (ellipsis_count) {
            case 2: {
                // look ahead for the "(2).." that turns ".." into
                // "..(2).."; the position is only advanced on a match
                const char* tmp = next;
                if (*tmp++ == '(' && *tmp++ == '2' && *tmp++ == ')'
                    && *tmp++ == '.' && *tmp++ == '.') {
                    // double increment hexadecimal symbolic ellipsis
                    next_tok.token = tok_dbl_ellipsis;
                    next = tmp;
                }
                else {
                    // hexadecimal symbolic ellipsis
                    next_tok.token = tok_hex_ellipsis;
                }
                break;
            }

            case 3:
                // absolute symbolic ellipsis
                next_tok.token = tok_abs_ellipsis;
                break;

            case 4:
                // decimal symbolic ellipsis
                next_tok.token = tok_dec_ellipsis;
                break;

            default:
                issue_diag (E_SYNTAX, true, &next_tok, "illegal ellipsis\n");
            }
            break;

        } 
        else if (*next == '\"') {

            // start of a string
            next_tok.name.push_back (*next++);
            const char ec = context_->escape_char;

            while (next[0] != '\n') {

                // escaped newline; continue on the following line
                // (read_line() resets the position `next` refers to)
                if (next [0] == ec && next [1] == '\n') {
                    read_line ();
                    continue;
                }

                // escaped quote
                if (next[0] == ec) {
                    next_tok.name.push_back (*next++);
                    next_tok.name.push_back (*next++);
                    continue;
                }

                if (next [0] == '\"') {
                    next_tok.name.push_back (*next);
                    break;
                }

                // still inside the string
                next_tok.name.push_back (*next++);
            }

            // test for closure
            if (*next == '\n')
                issue_diag (E_SYNTAX, true, &next_tok, "unterminated string");
            
            ++next;
            next_tok.token = tok_string;
            break;

        }
        else if (*next == context_->escape_char) {
            // start of an escape sequence
            // escaped new line
            if (next [1] == '\n') {
                // adjust to end of line
                context_->pos_ =
                    context_->line_.c_str () + context_->line_.size ();

                escaped_newline_ = true;
                continue;
            }

            // or an escape sequence extending up to the next separator
            while (   *next != ' ' && *next != '\t'
                   && *next != ';' && *next != '\n') {
                next_tok.name.push_back (*next++);
            }

            // retrieve token based on value
            next_tok.token = process_token (next_tok.name.c_str ());
            break;
        }
        else {
            // the rest of it
            for (const char ec = context_->escape_char; ; ) {

                // stop at esc-newline or at first "separator"
                if (   (next [0] == ec && next [1] == '\n')
                    || next [0] == ' '
                    || next [0] == '\t' 
                    || next [0] == '\n'
                    || next [0] == ';') {
                    // continuation of a line, separators
                    break;
                } 
                
                // fetch characters
                next_tok.name.push_back (*next++);
            }

            // assert length of input
            assert (next_tok.name.size ());

            // classify the word: a predefined keyword or tok_ndef
            next_tok.token = process_token (next_tok.name.c_str ());
            

            // an extra bit of processing since we keep comment and escape
            // characters in the scanner for a faster processing
            if (   next_tok.token == tok_escape_char
                || next_tok.token == tok_comment_char) {

                // eat away spaces
                while (' ' == *next || '\t' == *next) {
                    ++next;
                }

                // test for end of line
                if (*next == '\n')
                    issue_diag (E_SYNTAX, true, &next_tok,
                                "unterminated statement");

                // store character
                if (next_tok.token == tok_escape_char)
                    context_->escape_char = next [0];
                else
                    context_->comment_char = next [0];

                // adjust positions; the rest of the line is ignored
                context_->pos_ =
                    context_->line_.c_str () + context_->line_.size ();

                // return the token
                next_tok.name = "";
                next_tok.token = tok_nl;
            }

            break;
        }
    }

    ++ntokens_;
    
    return next_tok;
}


// Converts an escape sequence to its numeric value.
//
//   esc    text starting with the current escape character followed by
//          an octal number (optionally introduced by 'o'), or by 'd'
//          and a decimal number, or by 'x' and a hexadecimal number
//   pend   when non-null, *pend is set to point just past the last
//          character converted; when null (the default), any text left
//          after a single escape sequence is diagnosed as an error
//   multi  when true, a run of adjacent escape sequences is converted,
//          each contributing one byte, most significant byte first
//
// Returns the converted value.  Problems are reported through
// issue_diag(): a missing escape character, an unknown base letter, a
// missing number, a byte greater than UCHAR_MAX, and a sequence of
// more bytes than fit in an unsigned long.
unsigned long Scanner::
convert_escape (const char  *esc,
                const char **pend  /* = 0 */,
                bool         multi /* = false */) const
{
    assert (0 != esc);

    const char escape = escape_char ();

    if (escape != *esc)
        issue_diag (E_SYNTAX, true, 0,
                    "expected the escape character ('%c'), got \"%s\"\n",
                    escape, esc);

    unsigned long value = 0;

    for (const char *s = esc; ; ) {

        // escaped characters are octal by default
        const char *basename = "octal";
        int         base     = 8;

        // step over the escape character and look at what follows
        switch (*++s) {
        case 'd': ++s; base = 10; basename = "decimal"; break;
        case 'x': ++s; base = 16; basename = "hexadecimal"; break;

        case 'o': ++s;   // explicit octal: skip the letter, fall through
        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
            break;

        default:
            issue_diag (E_SYNTAX, true, 0,
                        "one of { 'o', 'd', 'x' } expected following "
                        "the escape character: %s\n", esc);
        }

        char *end = 0;

        const unsigned long byte = std::strtoul (s, &end, base);

        // tell the caller, if interested, where the conversion stopped
        if (pend)
            *pend = end;

        // nothing was converted: a base letter with no number after it
        if (end == s)
            issue_diag (E_SYNTAX, true, 0,
                        "%s constant expected: %s\n", basename, esc);

        // a caller that did not ask for the end position cannot deal
        // with trailing text, so anything after a single constant is
        // an error
        if (!multi && 0 == pend && *end)
            issue_diag (E_SYNTAX, true, 0,
                        "%s constant expected: %s\n", basename, esc);

        if (UCHAR_MAX < byte)
            issue_diag (E_INVAL, true, 0,
                        "%s byte value must be in the range [0, %d]: %s\n",
                        basename, int (UCHAR_MAX), esc);

        // the most significant byte is already occupied: shifting in
        // another one would lose it
        if (value >> (sizeof (unsigned long) - 1) * CHAR_BIT)
            issue_diag (E_INVAL, true, 0, "integer overflow: %s\n", esc);

        value = (value << CHAR_BIT) | byte;

        // `end` rather than *pend is examined here since pend may
        // legitimately be null
        if (*end != escape || !multi)
            break;

        // continue with the adjacent escape sequence
        s = end;
    }

    return value;
}