/***************************************************************************
 *
 * scanner.h
 *
 * $Id: scanner.h 648752 2008-04-16 17:01:56Z faridz $
 *
 ***************************************************************************
 *
 * Licensed to the Apache Software  Foundation (ASF) under one or more
 * contributor  license agreements.  See  the NOTICE  file distributed
 * with  this  work  for  additional information  regarding  copyright
 * ownership.   The ASF  licenses this  file to  you under  the Apache
 * License, Version  2.0 (the  "License"); you may  not use  this file
 * except in  compliance with the License.   You may obtain  a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the  License is distributed on an  "AS IS" BASIS,
 * WITHOUT  WARRANTIES OR CONDITIONS  OF ANY  KIND, either  express or
 * implied.   See  the License  for  the  specific language  governing
 * permissions and limitations under the License.
 *
 * Copyright 2001-2006 Rogue Wave Software.
 * 
 **************************************************************************/

#ifndef RWSTD_UTIL_SCANNER_H_INCLUDED
#define RWSTD_UTIL_SCANNER_H_INCLUDED

#include <string>
#include <stack>
#include <climits>   // for ULONG_MAX


struct ScannerContext;


// Scanner declares the tokenizer interface used for character map and
// locale definition files.  The public interface consists of open() and
// close(), next_token(), and a few helpers; all member functions are
// only declared here and are defined out of line.
//
// Objects of this class are not copyable (the copy constructor and the
// assignment operator are private and intentionally left undefined).
class Scanner
{
public:

    // enumeration of all tokens in the character map 
    // and locale definition file
    //
    // NOTE: the enumerators carry no explicit values, so their numeric
    // values follow from declaration order; reordering or inserting
    // enumerators changes those values
    enum token_id {
        tok_code_set_name,   // <code_set_name>
        tok_mb_cur_max,      // <mb_cur_max>
        tok_mb_cur_min,      // <mb_cur_min>
        // sections
        tok_charmap,         // beginning of CHARMAP section
        tok_collate,         // beginning of LC_COLLATE section
        tok_ctype,           // beginning of LC_CTYPE section
        tok_messages,        // beginning of LC_MESSAGES section
        tok_monetary,        // beginning of LC_MONETARY section
        tok_numeric,         // beginning of LC_NUMERIC section
        tok_time,            // beginning of LC_TIME section
        // ISO/IEC TR 14652 extensions:
        tok_addr,            // beginning of LC_ADDRESS section
        tok_ident,           // beginning of LC_IDENTIFICATION section
        tok_measure,         // beginning of LC_MEASUREMENT section
        tok_name,            // beginning of LC_NAME section
        tok_paper,           // beginning of LC_PAPER section
        tok_phone,           // beginning of LC_TELEPHONE section
        //
        tok_end,             // END of a section
        // LC_CTYPE-specific tokens
        tok_upper,           // upper section of LC_CTYPE
        tok_lower,           // lower section of LC_CTYPE
        tok_digit,           // digit section of LC_CTYPE
        tok_space,           // space section of LC_CTYPE
        tok_alpha,           // alpha section of LC_CTYPE
        tok_graph,           // graph section of LC_CTYPE
        tok_print,           // print section of LC_CTYPE
        tok_cntrl,           // cntrl section of LC_CTYPE
        tok_punct,           // punct section of LC_CTYPE
        tok_xdigit,          // xdigit section of LC_CTYPE
        tok_blank,           // blank section of LC_CTYPE
        tok_tolower,         // tolower section of LC_CTYPE
        tok_toupper,         // toupper section of LC_CTYPE
        // LC_COLLATE-specific tokens
        // NOTE(review): the uncommented enumerators below presumably
        // correspond to the LC_COLLATE keywords of the same name --
        // confirm against the definition of process_token()
        tok_script,
        tok_coll_elem,       // collating-element
        tok_coll_sym,        // collating symbol
        tok_from,
        tok_xlit_start,      // translit_start
        tok_xlit_end,        // translit_end
        tok_reorder,
        tok_reorder_end,
        tok_reorder_section,
        tok_reorder_section_end,
        tok_order_start,
        tok_order_end,
        tok_forward,
        tok_backward,
        tok_position,
        tok_undefined,
        // tokens not tied to a single keyword group
        // NOTE(review): tok_string is presumably a quoted string and
        // tok_ignore the IGNORE keyword -- confirm in the implementation
        tok_string,
        tok_ignore,
        // absolute, hexadecimal, decimal, and double-increment
        // ellipses (see ISO/IEC TR 14652)
        tok_abs_ellipsis,    // "..."
        tok_hex_ellipsis,    // ".."
        tok_dec_ellipsis,    // "...."
        tok_dbl_ellipsis,    // "..(N).."
        tok_width,
        // LC_MONETARY-specific tokens
        // (enumerator names presumably mirror the keyword names of
        // the section, one token per keyword -- TODO confirm)
        tok_int_curr_symbol,
        tok_currency_symbol, 
        tok_mon_decimal_point,
        tok_mon_thousands_sep, 
        tok_mon_grouping,
        tok_positive_sign,
        tok_negative_sign,
        tok_int_frac_digits,
        tok_frac_digits,
        tok_p_cs_precedes,
        tok_p_sep_by_space,
        tok_n_cs_precedes,
        tok_n_sep_by_space,
        tok_p_sign_posn,
        tok_n_sign_posn,
        tok_int_p_cs_precedes,
        tok_int_n_cs_precedes,
        tok_int_p_sep_by_space,
        tok_int_n_sep_by_space,
        tok_int_p_sign_posn,
        tok_int_n_sign_posn,
        // LC_NUMERIC-specific tokens
        tok_decimal_point,   // decimal point
        tok_thousands_sep,   // thousands_sep
        tok_grouping,        // grouping
        tok_truename,        // truename (C++ extension)
        tok_falsename,       // falsename (C++ extension)
        // LC_TIME-specific tokens
        // (enumerator names presumably mirror the keyword names of
        // the section, one token per keyword -- TODO confirm)
        tok_abday,
        tok_day,
        tok_abmon,
        tok_mon,
        tok_d_t_fmt,
        tok_d_fmt,
        tok_t_fmt,
        tok_am_pm,
        tok_t_fmt_ampm,
        tok_era,
        tok_era_d_fmt,
        tok_era_t_fmt,
        tok_era_d_t_fmt, 
        tok_alt_digits,
        // LC_MESSAGES-specific tokens
        tok_yesexpr,
        tok_noexpr,
        // the following sections have a section token above but
        // no keyword-specific tokens are declared for them:
        // LC_ADDRESS-specific tokens
        // LC_IDENTIFICATION-specific tokens
        // LC_MEASUREMENT-specific tokens
        // LC_NAME-specific tokens
        // LC_PAPER-specific tokens
        // LC_TELEPHONE-specific tokens
        // other:
        tok_sym_name,        // symbolic character name
        tok_char_value,      // character value (octal, decimal, or hex)
        tok_comment,         // comment
        tok_comment_char,    // <comment_char>
        tok_escape_char,     // <escape_char>
        tok_copy,            // copy directive
        tok_include,         // include directive
        tok_nl,              // newline
        tok_ndef,            // unknown/undefined token
        tok_end_tokens       // end of input
    };

    // scanner states (currently disabled)
    // enum {valid, invalid};

    // a structure that represents a token
    //
    // NOTE: token_t declares no constructor, so a default-initialized
    // automatic object has indeterminate token, line, column, and file
    // members until they are assigned
    struct token_t {
        std::string name;    // text associated with the token
        token_id    token;   // kind of the token

        // file position 
        int  line;
        int  column;

        // file name pointer
        // (non-owning as far as this header shows; the lifetime of the
        // pointed-to name is presumably tied to the scanner's current
        // context -- TODO confirm before storing tokens long-term)
        const char* file;
    };

    // construction and destruction; the destructor is virtual so
    // that Scanner may be used as a polymorphic base class
    Scanner ();
    virtual ~Scanner();

    // public interface

    // returns the next token, by value
    token_t next_token ();

    // NOTE(review): the parameters are unnamed; presumably the string
    // is the name of the file to scan and the two characters are the
    // initial comment character (default '#') and escape character
    // (default '\\') -- confirm against the definition
    void open (std::string, char = '#', char = '\\');

    void close ();

    // returns the escape character (presumably the one currently
    // in effect for the file being scanned)
    char escape_char () const;

    void ignore_line ();

    // converts an octal, decimal, or hexadecimal escape sequence
    // (or a multibyte sequence of such things) to a numeric value
    //
    // NOTE(review): the parameters are unnamed; the first is the
    // sequence to convert, the purpose of the optional const char**
    // (default 0) and bool (default false) arguments is not visible
    // in this header -- see the definition
    unsigned long
    convert_escape (const char*, const char** = 0, bool = false) const;

private:

    // copying is disabled: both functions are private and
    // deliberately have no definition
    Scanner (const Scanner&);          // not defined
    void operator= (const Scanner&);   // not defined

    // helper function that identifies a token from a string and 
    // returns the corresponding token_id
    token_id process_token (const char* name);

    // read a line from stream
    void read_line ();

    // current file context and stack of context objects
    // (ScannerContext is only forward-declared in this header)
    ScannerContext*             context_;
    std::stack<ScannerContext*> context_stack_;

    unsigned nlines_;    // number of lines read
    unsigned ntokens_;   // number of tokens read

    // was the last token an escaped newline
    bool escaped_newline_;
};


#endif   // RWSTD_UTIL_SCANNER_H_INCLUDED