#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include "util.h"

#include "regex.hh"

#include <ostream>
#include <iomanip>
#include <queue>
#include <stdexcept>
#include <ostream>

// File-local vocabulary types shared by the NFA builder and the DFA
// construction further down in this file.
namespace {
    // Label of a wildcard arc ("any character"). INT16_MIN is used so the
    // value sorts before every other label stored in a char_set.
    enum { CHAR_ANY=INT16_MIN };
    // Arc label: a character value or CHAR_ANY.
    typedef int16_t char_t;
    // Ordered set of arc labels.
    typedef std::set<char_t> char_set;
    // Outgoing arcs of one state: label -> target state. A multimap, so one
    // label may lead to several targets.
    typedef std::multimap<char_t, regex_state *> state_map;
    // Ordered set of states (ordered by pointer value).
    typedef std::set<regex_state *> state_set;
};

// One automaton state; used both for the NFA built by the parser and for the
// DFA stored in regex::states. States are non-copyable and are owned through
// raw pointers by the vector that created them (NFA::states / regex::states).
struct regex_state {
    size_t N;      // index of this state in its owning vector (printed in dot output)
    state_map map; // arcs leading from this state
    bool accepting; // true if reaching this state means a match

    // Creates a non-accepting state numbered N_ with no outgoing arcs.
    regex_state(size_t N_)
        : N(N_), map(), accepting(false)
    {}

    // Adds every target reachable from this state over label ch to set.
    void insert_states(char_t ch, state_set &set);

private:
    // Declared but not defined: copying is disabled.
    regex_state(const regex_state &);
    const regex_state& operator=(const regex_state &);
};

// Collects into `set` every state that an arc labelled exactly `ch` leads to
// from this state. A CHAR_ANY arc is only followed when ch itself is
// CHAR_ANY; callers that want wildcard arcs must ask for them separately.
void regex_state::insert_states(char_t ch, state_set &set)
{
    state_map::const_iterator arc=map.lower_bound(ch);
    const state_map::const_iterator stop=map.upper_bound(ch);
    while(arc!=stop) {
        set.insert(arc->second);
        ++arc;
    }
}

namespace {
    using std::string;
    using std::vector;
    using std::set;
    using std::multimap;
    using std::pair;
    using std::auto_ptr;

    // Recursive-descent parser that turns a regular expression string into a
    // non-deterministic automaton. All states created while parsing are kept
    // in `states` and deleted by finalize() (called from the destructor, or
    // from the constructor if parsing throws).
    class NFA {
        // Partially built automaton fragment produced by one parse step.
        //  - states: the states the fragment ends in (its exits),
        //  - arcs:   the arcs that enter the fragment; they are not attached
        //            to any source state until apply_arcs() is called,
        //  - matches_nothing: true if the fragment also matches the empty string.
        class param {
            NFA *nfa;
            state_set states;
            typedef pair<char_t, regex_state *> arc_data;
            vector<arc_data> arcs;
            bool matches_nothing;

            // Declared but not defined: copying is disabled.
            param(const param &);
            const param& operator=(const param &);
        public:
            param(NFA *nfa_, bool matches_nothing_)
                : nfa(nfa_), states(), arcs(), matches_nothing(matches_nothing_)
            {}

            // Records an entry arc labelled ch that leads to st.
            void add_arc(char_t ch, regex_state *st)
            { arcs.push_back(arc_data(ch, st)); }
            // Attaches every entry arc to each state in [first, last).
            template<typename InputIterator>
                void apply_arcs(InputIterator first, const InputIterator &last) const;

            // Records st as an exit state of the fragment.
            void add_state(regex_state *st)
            { states.insert(st); }
            const state_set& get_states()
            { return states; }

            // Records st as an exit state and adds one entry arc to st for
            // every label in [first, last).
            template<typename InputIterator>
                void add_state_arc(regex_state *st,
                        InputIterator first, const InputIterator &last);

            bool get_matches_nothing() const
            { return matches_nothing;}
            void set_matches_nothing(bool matches_nothing_)
            { matches_nothing=matches_nothing_; }

            // Alternation: this fragment OR r.
            const param& operator+=(const param &r);
            // Concatenation: this fragment followed by r.
            const param& operator*=(const param &r);
            // Lets the fragment repeat by looping its exits back to its entry arcs.
            void iterate();
        };

        // regex::MATCH_* bits given to the constructor.
        uint32_t flags;

        // Allocates a state, appends it to `states` and returns it.
        regex_state* new_state();

        // Grammar productions. Each one starts reading at str[i] and advances
        // i past the text it consumed.
        auto_ptr<param> parse_alternation(const string &str, string::size_type &i, bool first);
        auto_ptr<param> parse_concatenation(const string &str, string::size_type &i);
        auto_ptr<param> parse_iatom(const string &str, string::size_type &i);
        auto_ptr<param> parse_atom(const string &str, string::size_type &i);
        pair<size_t, size_t> parse_braces(const string &str, string::size_type &i);
        size_t parse_number(const string &str, string::size_type &i);
        auto_ptr<char_set> parse_brackets(const string &str, string::size_type &i);

        // Deletes every state in `states`.
        void finalize();

    public:
        // All states of the automaton; states[0] is the start state.
        vector<regex_state *> states;

        // Parses str; parse errors are reported through regex::exception_* types.
        NFA(const string &str, uint32_t flags_);
        ~NFA()
        { finalize(); }
    };

    template<typename InputIterator>
        void NFA::param::apply_arcs(InputIterator first, const InputIterator &last) const
        {
            for(; first!=last; ++first) {
                for(vector<arc_data>::const_iterator j=arcs.begin(); j!=arcs.end(); ++j)
                    (*first)->map.insert(state_map::value_type(j->first, j->second));
            }
        }

    // Registers st as an exit state of the fragment and adds one entry arc
    // leading to st for every label in [first, last).
    template<typename InputIterator>
        void NFA::param::add_state_arc(regex_state *st,
                InputIterator first, const InputIterator &last)
        {
            states.insert(st);
            while(first!=last) {
                arcs.push_back(arc_data(*first, st));
                ++first;
            }
        }

    // Alternation (this | r): the result may be entered through either
    // operand's entry arcs, ends in either operand's exit states, and matches
    // the empty string if either operand does.
    const NFA::param& NFA::param::operator+=(const param &r)
    {
        ASSERT(nfa==r.nfa);

        for(state_set::const_iterator s=r.states.begin(); s!=r.states.end(); ++s)
            states.insert(*s);
        arcs.insert(arcs.end(), r.arcs.begin(), r.arcs.end());
        if(r.matches_nothing)
            matches_nothing=true;
        return *this;
    }

    // Concatenation (this followed by r).
    //  - r's entry arcs are attached to the current exit states;
    //  - the exits become r's exits, plus the old ones if r can match empty;
    //  - r's entry arcs also enter the result if this fragment can match empty;
    //  - the result matches empty only if both operands do.
    const NFA::param& NFA::param::operator*=(const param &r)
    {
        ASSERT(nfa==r.nfa);

        const bool left_can_be_empty=matches_nothing;
        const bool right_can_be_empty=r.matches_nothing;

        r.apply_arcs(states.begin(), states.end());

        if(!right_can_be_empty)
            states=r.states;
        else
            states.insert(r.states.begin(), r.states.end());

        if(left_can_be_empty)
            arcs.insert(arcs.end(), r.arcs.begin(), r.arcs.end());

        matches_nothing=left_can_be_empty && right_can_be_empty;
        return *this;
    }

    // Makes the fragment repeatable: attaches its own entry arcs to its own
    // exit states, so that after one pass the fragment can be entered again.
    // matches_nothing is not touched; callers set it when zero repetitions
    // must also be accepted.
    void NFA::param::iterate()
    {
        apply_arcs(states.begin(), states.end());
    }

    void NFA::finalize()
    {
        for(vector<regex_state *>::const_iterator i=states.begin(); i!=states.end(); ++i)
            delete *i;
    }

    // Builds the NFA for str. states[0] is the start state. Unless
    // regex::MATCH_FULL is set, the start state gets a CHAR_ANY arc back to
    // itself so that a match may begin at any position of the input. If
    // parsing throws, the states created so far are deleted before the
    // exception propagates.
    NFA::NFA(const string &str, uint32_t flags_)
        : flags(flags_), states()
    {
        try {
            // Empty fragment whose only exit is the start state.
            auto_ptr<param> start(new param(this, true));
            start->add_state(new_state());

            string::size_type pos=0;
            auto_ptr<param> body(parse_alternation(str, pos, true));

            const bool anchored=(flags & regex::MATCH_FULL)!=0;
            if(!anchored)
                body->add_arc(CHAR_ANY, states[0]);

            *start *= *body;

            // Whatever the whole expression ends in is an accepting state.
            const state_set &finals=start->get_states();
            state_set::const_iterator it=finals.begin();
            while(it!=finals.end()) {
                (*it)->accepting=true;
                ++it;
            }
        }
        catch(...) {
            finalize();
            throw;
        }
    }

    // Creates a new state numbered with its index in `states`, appends it
    // and returns it. The vector keeps ownership.
    //
    // The slot is appended before the state is allocated: in the previous
    // form a throwing push_back (reallocation failure) leaked the state that
    // had just been created.
    regex_state* NFA::new_state()
    {
        states.push_back(static_cast<regex_state *>(0));
        try {
            states.back()=new regex_state(states.size()-1);
        }
        catch(...) {
            states.pop_back(); // do not leave a null entry behind
            throw;
        }
        return states.back();
    }

    // Parses a '|'-separated list of concatenations.
    //
    // first == true : top-level expression; parsing runs to the end of str
    //                 and a stray ')' raises regex::exception_unmatched at
    //                 the position of that ')'.
    // first == false: parenthesised group; str[i] must be '(' on entry, the
    //                 matching ')' is consumed, and a missing ')' raises
    //                 regex::exception_unmatched at the position of the '('.
    auto_ptr<NFA::param> NFA::parse_alternation(const string &str, string::size_type &i, bool first)
    {
        ASSERT(first || str[i]=='(');

        const string::size_type open_pos=i;
        if(!first)
            ++i; // step over '('

        auto_ptr<param> ret(parse_concatenation(str, i));
        for(;;) {
            if(i>=str.size()) {
                if(!first)
                    throw regex::exception_unmatched(open_pos);
                return ret;
            }
            if(str[i]==')')
                break;

            // parse_concatenation stops only at '|', ')' or end of input,
            // so str[i] is '|' here: skip it and parse the next branch.
            ++i;
            auto_ptr<param> branch(parse_concatenation(str, i));
            *ret += *branch;
        }

        // str[i]==')'
        if(first)
            throw regex::exception_unmatched(i);
        ++i;
        return ret;
    }

    // Parses a sequence of quantified atoms up to (not including) the next
    // '|' or ')' or the end of str, chaining them together. An empty
    // sequence yields a fragment that matches only the empty string.
    auto_ptr<NFA::param> NFA::parse_concatenation(const string &str, string::size_type &i)
    {
        auto_ptr<param> seq(new param(this, true));
        for(;;) {
            if(i>=str.size())
                break;
            const char c=str[i];
            if(c=='|' || c==')')
                break;
            auto_ptr<param> piece(parse_iatom(str, i));
            *seq *= *piece;
        }
        return seq;
    }

    // Parses one atom followed by an optional quantifier:
    //   ?      zero or one
    //   *      zero or more
    //   +      one or more
    //   {m}    exactly m      {m,n}  between m and n      {m,}  at least m
    //
    // Brace intervals are expanded by re-parsing the atom text (starting at
    // oldi) once per extra copy, so every copy gets its own states.
    //
    // Fix: parse_braces reports "{m,}" as second==SIZE_MAX. The bounded
    // expansion loop used to run for that value too, re-parsing the atom
    // roughly SIZE_MAX times until memory ran out. Open-ended intervals are
    // now built as x+ followed by m-1 mandatory copies (x* when m==0).
    auto_ptr<NFA::param> NFA::parse_iatom(const string &str, string::size_type &i)
    {
        const string::size_type oldi=i; // start of the atom, for re-parsing
        auto_ptr<param> ret(parse_atom(str, i));

        if(i>=str.size())
            return ret;

        switch(str[i]) {
            case '?':
                ret->set_matches_nothing(true);
                ++i;
                break;
            case '*':
                ret->set_matches_nothing(true);
                /*fall through*/
            case '+':
                ret->iterate();
                ++i;
                break;
            case '{': {
                const pair<size_t, size_t> interval=parse_braces(str, i);

                if(interval.second==0) {
                    // x{0} / x{0,0}: matches only the empty string
                    ret.reset(new param(this, true));
                    break;
                }

                if(interval.second==SIZE_MAX) {
                    // x{m,}: one repeatable copy, then m-1 further copies.
                    ret->iterate();
                    for(size_t j=interval.first; j>1; --j) {
                        string::size_type ti=oldi;
                        auto_ptr<param> t(parse_atom(str, ti));
                        *ret *= *t;
                    }
                    if(interval.first==0)
                        ret->set_matches_nothing(true);
                    break;
                }

                // x{m,n} with finite n>=1: n copies in a row, of which the
                // leading n-k may be skipped.
                const bool matches_nothing=ret->get_matches_nothing();
                if(interval.second>1) {
                    const size_t k=matches_nothing?0:std::max(interval.first, size_t(1));
                    for(size_t j=interval.second; j>1; --j) {
                        string::size_type ti=oldi;
                        auto_ptr<param> t(parse_atom(str, ti));
                        if(j>k) {
                            // Let the copies built so far be skipped, but
                            // require the one being appended.
                            ret->set_matches_nothing(true);
                            t->set_matches_nothing(false);
                        }
                        *ret *= *t;
                    }
                    ret->set_matches_nothing(matches_nothing);
                }
                if(interval.first==0)
                    ret->set_matches_nothing(true);
                break;
            }
        }
        return ret;
    }

    // Parses a single atom starting at str[i] (callers guarantee
    // i < str.size()) and advances i past it:
    //   .        any character
    //   [...]    bracket expression (see parse_brackets)
    //   (...)    group (see parse_alternation)
    //   \c       escaped metacharacter, taken literally
    //   other    the character itself
    // Throws regex::exception_parse for a metacharacter that cannot start an
    // atom and regex::exception_bad_escape for '\' followed by anything other
    // than a metacharacter (or by nothing).
    //
    // Fixes: the end-of-string test for '\' now checks i+1 (the old test on
    // i could never be true), and tolower() receives an unsigned char value,
    // since passing a negative char to it is undefined behaviour.
    auto_ptr<NFA::param> NFA::parse_atom(const string &str, string::size_type &i)
    {
        auto_ptr<param> ret(new param(this, false));
        char_t tmpchar[1];
        auto_ptr<char_set> tmpset;

        switch(str[i]) {
            case '{': case '}': case ')': case ']':
            case '+': case '*': case '?': case '|':
                throw regex::exception_parse(i);
            case '.':
                tmpchar[0]=CHAR_ANY;
                ret->add_state_arc(new_state(), tmpchar+0, tmpchar+1);
                ++i;
                break;
            case '[':
                tmpset=parse_brackets(str, i);
                ret->add_state_arc(new_state(), tmpset->begin(), tmpset->end());
                break;
            case '(':
                ret=parse_alternation(str, i, false);
                break;
            case '\\': {
                const string escapable(".[]()*+?{}|^$\\");
                if(i+1>=str.size() || escapable.find(str[i+1])==string::npos)
                    throw regex::exception_bad_escape(i);
                ++i;
            }
                // fall through
            default:
                if(flags&regex::MATCH_CASE_INSENSITIVE)
                    // Cast back to char so the label keeps the same (possibly
                    // negative) representation as labels that are not folded.
                    tmpchar[0]=static_cast<char>(tolower(static_cast<unsigned char>(str[i])));
                else
                    tmpchar[0]=str[i];
                ret->add_state_arc(new_state(), tmpchar+0, tmpchar+1);
                ++i;
        }
        return ret;
    }

    // Parses a brace interval starting at str[i]=='{' and advances i past
    // the closing '}'. Returns (min, max):
    //   {m}    -> (m, m)
    //   {m,}   -> (m, SIZE_MAX)
    //   {m,n}  -> (m, n), with n >= m required
    // Anything else raises regex::exception_bad_interval at the offending
    // position.
    pair<size_t, size_t> NFA::parse_braces(const string &str, string::size_type &i)
    {
        ASSERT(str[i]=='{');
        ++i;

        const size_t lower=parse_number(str, i);

        if(str[i]=='}') {
            ++i;
            return pair<size_t, size_t>(lower, lower);
        }

        if(str[i]!=',')
            throw regex::exception_bad_interval(i);
        ++i;

        if(str[i]=='}') {
            ++i;
            return pair<size_t, size_t>(lower, SIZE_MAX);
        }

        const size_t upper=parse_number(str, i);
        if(upper<lower || str[i]!='}')
            throw regex::exception_bad_interval(i);
        ++i;
        return pair<size_t, size_t>(lower, upper);
    }

    size_t NFA::parse_number(const string &str, string::size_type &i)
    {
        if(!isdigit(str[i]))
            throw regex::exception_bad_interval(i);
        size_t t=0;
        while(isdigit(str[i])) {
            t=t*10 + str[i]-'0';
            if(t>regex::INTERVAL_MAX)
                throw regex::exception_bad_interval(i);
            ++i;
        }
        return t;
    }

    // Parses a bracket expression starting at str[i]=='[' and advances i past
    // the closing ']'. Returns the set of characters the expression matches.
    //  - A leading '^' inverts the set (relative to CHAR_MIN..CHAR_MAX).
    //  - The first character of the list is always taken literally, so "[]]"
    //    and "[^]]" contain ']'.
    //  - "a-b" adds every character from a to b; a '-' directly before the
    //    closing ']' is a literal. A range with b < a adds nothing.
    // Running off the end of str raises regex::exception_unmatched at the
    // position of the '['.
    // NOTE(review): characters are stored as written; flags (e.g.
    // MATCH_CASE_INSENSITIVE) are not consulted here - confirm that this is
    // intended, since parse_atom folds literals to lower case.
    auto_ptr<char_set> NFA::parse_brackets(const string &str, string::size_type &i)
    {
        ASSERT(str[i]=='[');
        string::size_type tmpi=i; // position of '[' for error reporting
        ++i;

        auto_ptr<char_set> ret(new char_set());
        bool invert=false;

        try {
            // str.at() throws std::out_of_range past the end, which is how a
            // missing ']' is detected.
            if(str.at(i)=='^') {
                invert=true;
                ++i;
            }

            // do-while: at least one list element is consumed before ']' can
            // close the expression.
            do {
                char_t a=str.at(i++);
                if(str.at(i)=='-' && str.at(i+1)!=']') {
                    char_t b=str.at(++i);
                    ++i;
                    // Insert from the top down, passing the previous position
                    // as the insertion hint.
                    char_set::iterator k=ret->end();
                    for(char_t j=b; j>=a; --j)
                        k=ret->insert(k, j);
                } else
                    ret->insert(a);
            } while(str.at(i)!=']');
            ++i;
        }
        catch(std::out_of_range &) {
            throw regex::exception_unmatched(tmpi);
        }

        if(invert) {
            // t takes over the listed characters; ret is rebuilt as the
            // complement.
            auto_ptr<char_set> t(ret);
            ret.reset(new char_set());

            // CHAR_ANY is smaller than every char value, so the reverse walk
            // below can never run past the first element of t.
            t->insert(t->begin(), CHAR_ANY); // sentinel

            // Walk all char values from high to low in step with t (also
            // high to low): values above *j are not listed and go into ret;
            // a value equal to *j is listed, so it is skipped and j advances.
            char_set::const_reverse_iterator j=t->rbegin();
            char_set::iterator k=ret->end();
            for(int l=CHAR_MAX; l>=CHAR_MIN; --l) {
                if(l>*j)
                    k=ret->insert(k, l);
                else
                    ++j; // j is a reverse_iterator
            }
        }
        return ret;
    }

    // Renders one arc label for use inside a double-quoted dot label:
    //   '"' and '\'            -> preceded by a backslash
    //   ' ' .. 0x7f            -> the character itself
    //   anything else          -> "0x" followed by two upper-case hex digits
    //                             of the low byte
    //
    // Fix: labels taken from a signed char are negative for bytes >= 0x80;
    // c/16 and c%16 were then negative and indexed in front of the hex
    // table. The value is now reduced to its low byte first.
    string escape(char_t c)
    {
        static const char tohex[]="0123456789ABCDEF";

        if(c=='"' || c=='\\')
            return string("\\")+char(c);
        if(c>=' ' && c<0x80)
            return string()+char(c);

        const unsigned byte=static_cast<unsigned>(c) & 0xffu;
        string t="0x";
        t+=tohex[byte>>4];
        t+=tohex[byte&0x0fu];
        return t;
    }

    // Renders a set of labels in bracket-expression style: runs of
    // consecutive values are collapsed to "first-last" (two adjacent values
    // are simply written one after the other), every value passing through
    // escape(). An empty set yields an empty string.
    //
    // Fix: the start of the current run used to be recovered by looking at
    // the last character of the text produced so far. That only works when
    // escape() emits exactly the character itself; for escaped values (e.g.
    // "0x0A") a single value was printed twice and the '-' decision compared
    // unrelated characters. The run bounds are now tracked explicitly.
    string build_str(const char_set &set)
    {
        string ret;
        char_set::const_iterator i=set.begin();
        while(i!=set.end()) {
            const char_t run_first=*i;
            char_t run_last=run_first;
            // Extend the run while the next element is the direct successor.
            for(++i; i!=set.end() && *i==run_last+1; ++i)
                run_last=*i;

            ret+=escape(run_first);
            if(run_last!=run_first) {
                if(run_last>run_first+1)
                    ret+='-';
                ret+=escape(run_last);
            }
        }
        return ret;
    }

    void print_dot_graph(std::ostream &s, const vector<regex_state *> &v, bool dfa)
    {
        s << "digraph A {\n";
        s << "    node [shape=circle];\n\n";

        for(size_t i=0; i<v.size(); ++i)
            if(v[i]->accepting)
                s << "    " << i << " [peripheries=2];\n";
        s << "\n";


        for(size_t i=0; i<v.size(); ++i) {
            const regex_state &st=*v[i];
            state_set set_st;
            for(state_map::const_iterator j=st.map.begin(); j!=st.map.end(); ++j)
                set_st.insert(j->second);
            for(state_set::const_iterator j=set_st.begin(); j!=set_st.end(); ++j) {
                char_set set_ch;
                for(state_map::const_iterator k=st.map.begin(); k!=st.map.end(); ++k)
                    if(k->second==*j)
                        set_ch.insert(k->first);
                if(dfa && set_ch.erase(CHAR_ANY))
                    for(char_t k=0x00; k<=0xff; ++k)
                        if(!st.map.count(k))
                            set_ch.insert(k);

                string normal=set_ch.count(CHAR_ANY)?"*":build_str(set_ch);

                char_set inv;
                for(char_t k=0x00; k<=0xff; ++k)
                    if(!set_ch.count(k))
                        inv.insert(k);
                string inverse=string("^")+build_str(inv);

                s << "    " << st.N << " -> " << (*j)->N << " [label=\"" <<
                    (normal.size()<=inverse.size()?normal:inverse) << "\"];\n";
            }
            s << "\n";
        }

        s << '}' << std::endl;
    }

}


// Compiles str into a deterministic automaton stored in `states`
// (states[0] is the start state).
//
// The expression is first parsed into an NFA; the DFA is then produced by
// subset construction: every DFA state stands for a set of NFA states, and
// sets are processed breadth-first from the set containing only the NFA
// start state. Unless MATCH_FULL is set, accepting DFA states get no
// outgoing arcs, because match() stops as soon as it reaches one.
//
// If nfa_str / dfa_str are non-null, the NFA / DFA are written to them in
// Graphviz dot format.
//
// Parse errors from the NFA constructor propagate unchanged.
//
// Fix: the destructor does not run for an object whose constructor throws,
// so DFA states already created were leaked when a later step threw (e.g.
// std::bad_alloc, or a stream configured to throw). They are now deleted
// before the exception is rethrown, matching what NFA's constructor does.
regex::regex(const std::string &str, uint32_t flags_, std::ostream *nfa_str, std::ostream *dfa_str)
    : flags(flags_), states()
{

    typedef std::map<state_set, regex_state *> Map;

    try {
        NFA nfa(str, flags);
        Map map; // NFA state set -> DFA state representing it

        std::queue<state_set> q; // sets whose outgoing arcs are still to be built
        state_set init(nfa.states.begin(), nfa.states.begin()+1);
        q.push(init);
        map.insert(Map::value_type(init, new_state()));
        while(!q.empty()) {
            state_set s=q.front();
            q.pop();

            Map::const_iterator mi=map.find(s);
            ASSERT(mi!=map.end());

            // Collect every label leaving the set, and mark the DFA state
            // accepting if any member is accepting.
            char_set c;
            for(state_set::const_iterator i=s.begin(); i!=s.end(); ++i) {
                mi->second->accepting|=(*i)->accepting;
                for(state_map::const_iterator j=(*i)->map.begin(); j!=(*i)->map.end(); ++j)
                    c.insert(j->first);
            }

            if(flags&MATCH_FULL || !mi->second->accepting) {
                for(char_set::const_iterator i=c.begin(); i!=c.end(); ++i) {
                    // Target set for label *i: states reached over that
                    // label plus those reached over a wildcard arc.
                    state_set t;
                    for(state_set::const_iterator j=s.begin(); j!=s.end(); ++j) {
                        (*j)->insert_states(*i, t);
                        (*j)->insert_states(CHAR_ANY, t);
                    }

                    Map::const_iterator mj=map.find(t);
                    if(mj==map.end()) {
                        q.push(t);
                        mj=map.insert(Map::value_type(t, new_state())).first;
                    }

                    mi->second->map.insert(state_map::value_type(*i, mj->second));
                }
            }
        }

        if(nfa_str)
            print_dot_graph(*nfa_str, nfa.states, false);
        if(dfa_str)
            print_dot_graph(*dfa_str, states, true);
    }
    catch(...) {
        for(std::vector<regex_state *>::const_iterator i=states.begin(); i!=states.end(); ++i)
            delete *i;
        throw;
    }
}

// Deletes every DFA state owned by this object.
regex::~regex()
{
    for(std::vector<regex_state *>::size_type n=0; n<states.size(); ++n)
        delete states[n];
}

// Creates a new DFA state numbered with its index in `states`, appends it
// and returns it. The vector keeps ownership.
//
// The slot is appended before the state is allocated: in the previous form
// a throwing push_back (reallocation failure) leaked the state that had
// just been created.
regex_state* regex::new_state()
{
    states.push_back(static_cast<regex_state *>(0));
    try {
        states.back()=new regex_state(states.size()-1);
    }
    catch(...) {
        states.pop_back(); // do not leave a null entry behind
        throw;
    }
    return states.back();
}

bool regex::match(const std::string &str) const
{
    const regex_state *s=states[0];
    for(std::string::const_iterator i=str.begin(); i!=str.end() &&
            (!s->accepting || flags&MATCH_FULL); i++) {
        state_map::const_iterator j=s->map.find(flags&MATCH_CASE_INSENSITIVE ? tolower(*i) : *i);
        if(j==s->map.end())
            j=s->map.find(CHAR_ANY);
        if(j==s->map.end())
            return false;
        s=j->second;
    }
    return s->accepting;
}

// Writes a three-line diagnostic to os: the message from what(), the
// expression str, and a '^' placed under the character at index pos.
void regex::exception_parse::print(std::ostream &os, const std::string &str)
{
    os << what() << std::endl
       << str << std::endl
       << std::setw(pos+1) << '^' << std::endl;
}
