/*
 * LpLexer.java
 *
 * Copyright (C) 2006 - 2007 Martin Slota
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free Software
 * Foundation; either version 2 of the License, or (at your option) any later
 * version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc., 51
 * Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */

/*
 * History:
 * v0.1 (2006-12-27): taken from propositional EVOLP implementation
 * v0.2 (2007-01-01):
 * - former documentation checked
 * - tests adopted
 * - assertions added
 * v0.2.1 (2007-01-02):
 * - more tests added (see LexerTest)
 * v0.2.2 (2007-01-12):
 * - some refactoring performed
 * - constructors modified to throw IllegalArgumentException instead of
 *   NullPointerException
 * - main method commented out
 * v0.2.3 (2007-01-15):
 * - the constructor with a String argument accepts any CharSequence now
 * v0.2.4 (2007-01-28):
 * - checked exceptions eliminated (see
 *   http://www.mindview.net/Etc/Discussions/CheckedExceptions)
 * - documentation updated
 * v0.2.5 (2007-02-08):
 * - made reusable (setInput instead of constructors)
 * - documentation updated
 * v0.2.6 (2007-03-05):
 * - initialize made protected (needed to be overridden by LpLookaheadLexer)
 * 1.0.0 (2007-05-04):
 * - promoted to version 1.0.0 :o)
 */

package lp.parse;

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

import static lp.parse.LpTokenType.*;
import lp.util.ExceptionAdapter;

/**
 * Class that tokenizes a textual input. The {@link #nextToken()} method reads
 * tokens as they appear in the input. {@code get*} methods
 * ({@link #getTokenType()}, {@link #getLexem()}, {@link #getLineNumber()},
 * {@link #getPosition()} and {@link #getToken()}) return information
 * relevant to the last token read. All whitespace (as defined by
 * {@link Character#isWhitespace(char)}) and line comments (parts of input after
 * a '%' character until the next line break or input end) are ignored, i.e.
 * they are not used to generate any tokens. 9 types of tokens are recognized:
 * <ol>
 * <li>{@link LpTokenType#LEFT_PAREN} -- a left parenthesis '('</li>
 * <li>{@link LpTokenType#RIGHT_PAREN} -- a right parenthesis ')'</li>
 * <li>{@link LpTokenType#COMMA} -- a comma ','</li>
 * <li>{@link LpTokenType#DOT} -- a dot '.'</li>
 * <li>{@link LpTokenType#RULE_ARROW} -- a string "&lt;-" or a string ":-"</li>
 * <li>{@link LpTokenType#LOWERCASE_WORD} is a string of characters from the set
 * {'_', 'a', 'b', ..., 'z', 'A', 'B', ..., 'Z', '0', '1', ..., '9'} not
 * beginning with an uppercase letter. In other words
 * {@code ([_a-z0-9][_a-zA-Z0-9]*)}. The token is parsed greedily -- it ends
 * only in case the next character does not belong to the set mentioned above,
 * even if it's whitespace or a beginning of a comment.</li>
 * <li>{@link LpTokenType#UPPERCASE_WORD} is a string of characters from the set
 * {'_', 'a', 'b', ..., 'z', 'A', 'B', ..., 'Z', '0', '1', ..., '9'} beginning
 * with an uppercase letter. In other words {@code ([A-Z][_a-zA-Z0-9]*)}.</li>
 * <li>{@link LpTokenType#EOF} is returned when the end of input is happily
 * reached and also ever after.</li>
 * <li>{@link LpTokenType#UNKNOWN_CHAR} is returned if a character occurs that
 * couldn't be matched against any other token (just to be precise, it is none
 * of the following: whitespace, part of a line comment, '(', ')', ',', '.',
 * a '&lt;' or ':' followed by a '-', '_', a digit, a lower- or uppercase
 * letter). After this token is returned by {@link #nextToken()},
 * {@link #getLexem()} returns a string of length 1 with the alien
 * character.</li>
 * </ol>
 *
 * Example: If you execute this code:
 *
 * <pre>
 *LpLexer l = new LpLexer();
 *l.setInput("Simple, short sentence.");
 *l.nextToken();
 *LpTokenType t = l.getTokenType();
 *while (t != LpTokenType.EOF) {
 *    System.out.println("token: " + t.toString() + "; lexem: "
 *            + l.getLexem() + "; line number: " + l.getLineNumber()
 *            + "; position: " + l.getPosition());
 *    l.nextToken();
 *    t = l.getTokenType();
 *}
 *l.close();
 * </pre>
 *
 * you should get the following output:
 *
 * <pre>
 * token: UPPERCASE_WORD; lexem: Simple; line number: 1; position: 1
 * token: COMMA; lexem: ,; line number: 1; position: 7
 * token: LOWERCASE_WORD; lexem: short; line number: 1; position: 9
 * token: LOWERCASE_WORD; lexem: sentence; line number: 1; position: 15
 * token: DOT; lexem: .; line number: 1; position: 23
 * </pre>
 *
 * @author Martin Slota
 * @version 1.0.0
 * @see LpTokenType
 * @see LpToken
 */
public class LpLexer implements Closeable {
	/**
	 * The reader used to read the input. It is {@code null} until one of the
	 * {@code setInput()} methods is called.
	 */
	private Reader reader;
	
	/**
	 * The lookahead character, i.e. the next character of input that has not
	 * been consumed by any token yet. -1 means that the end of input has been
	 * reached. See {@link #readNewLA()}.
	 */
	private int la;
	
	/**
	 * Type of the last token read or {@code null} if no token has been read
	 * from the current input yet. See {@link #getTokenType()}.
	 */
	private LpTokenType type;
	
	/**
	 * A StringBuilder where the lexem corresponding to the last token read is
	 * kept. See {@link #getLexem()}.
	 */
	private final StringBuilder lexem;
	
	/**
	 * Number of the line on which the lookahead character occurs. Tokens never
	 * span multiple lines and the line break following a token is consumed
	 * only when the next token is requested, so this is also the line of the
	 * last token read. See {@link #getLineNumber()} for more information on
	 * how lines are numbered.
	 */
	private int lineNumber;
	
	/**
	 * Position of the lookahead character within its line. The position of the
	 * last token's beginning is computed from it in {@link #getPosition()}.
	 */
	private int position;
	
	/**
	 * Creates a new instance of {@code LpLexer}. One of the {@code setInput()}
	 * methods must be called before tokens can be read.
	 */
	public LpLexer() {
		reader = null;
		la = 0;
		type = null;
		lexem = new StringBuilder();
		lineNumber = 1;
		position = 0;
	}
	
	/**
	 * Sets the character input of this {@code LpLexer}. A {@link StringReader}
	 * is used to read the input character by character.
	 *
	 * Also resets information about the previously read token to the default
	 * values (as if no token was read before).
	 *
	 * @param input string with input for the {@code LpLexer}
	 * @throws IllegalArgumentException if {@code input} is {@code null}
	 */
	public void setInput(CharSequence input) {
		if (input == null)
			throw new IllegalArgumentException(
					"The source string cannot be null!");
		setInput(new StringReader(input.toString()));
	}
	
	/**
	 * Sets the contents of the given file as an input for this {@code LpLexer}.
	 * The default system encoding is used to read the contents of the file.
	 * If the first lookahead character cannot be read, the file is closed
	 * again before the failure is propagated.
	 *
	 * Also resets information about the previously read token to the default
	 * values (as if no token was read before).
	 *
	 * @param file the file with input for this {@code LpLexer}
	 * @throws IOException (wrapped in an {@link lp.util.ExceptionAdapter})
	 * in case an I/O exception occurs while opening or reading the file
	 * @throws IllegalArgumentException if {@code file} is {@code null}
	 */
	public void setInput(File file) {
		if (file == null)
			throw new IllegalArgumentException(
					"The source file cannot be null!");
		Reader fileReader;
		try {
			fileReader = new BufferedReader(new FileReader(file));
		} catch (IOException e) {
			throw new ExceptionAdapter(e);
		}
		boolean initialized = false;
		try {
			setInput(fileReader);
			initialized = true;
		} finally {
			if (!initialized) {
				// this method opened the file, so it must not leak it when
				// reading the first character fails
				try {
					fileReader.close();
				} catch (IOException ignored) {
					// the original failure is the one worth reporting
				}
			}
		}
	}
	
	/**
	 * The given character reader will be used as input for this
	 * {@code LpLexer}.
	 *
	 * Also resets information about the previously read token to the default
	 * values (as if no token was read before).
	 *
	 * @param reader a reader with input for the LpLexer
	 * @throws IOException (wrapped in an {@link lp.util.ExceptionAdapter})
	 * in case an I/O exception occurs while reading from the {@code Reader}
	 * @throws IllegalArgumentException if {@code reader} is {@code null}
	 */
	public void setInput(Reader reader) {
		if (reader == null)
			throw new IllegalArgumentException(
					"The source Reader cannot be null!");
		this.reader = reader;
		initialize();
	}
	
	/**
	 * Reinitializes members and reads the first lookahead character.
	 *
	 * @throws IOException (wrapped in an {@link lp.util.ExceptionAdapter}) in
	 * case an I/O error occurs while reading the first lookahead character
	 */
	protected void initialize() {
		type = null;
		lineNumber = 1;
		position = 0;
		lexem.setLength(0);
		// any value other than -1 makes readNewLA() actually read a character;
		// it also advances position from 0 to 1
		la = 0;
		readNewLA();
	}
	
	/**
	 * Closes the underlying reader. If {@link #setInput(CharSequence)} or
	 * {@link #setInput(File)} was used to set the current character source,
	 * this method should be called when no more tokens are required from the
	 * source. In other cases it is up to the programmer whether she will close
	 * the {@code Reader} given to {@link #setInput(Reader)} herself or call
	 * this method.
	 *
	 * Information about the last token read is discarded. If no input has been
	 * set yet, there is nothing to close and the method just returns.
	 *
	 * @throws IOException (wrapped in an {@link lp.util.ExceptionAdapter}) in
	 * case an I/O exception occurs while closing the underlying {@code Reader}
	 */
	public void close() {
		type = null;
		lexem.setLength(0);
		if (reader == null)
			return;
		try {
			reader.close();
		} catch (IOException e) {
			throw new ExceptionAdapter(e);
		}
	}
	
	/**
	 * Reads the next token occuring on the input. More information about tokens
	 * can be found in the class description.
	 *
	 * @throws IOException (wrapped in an {@link lp.util.ExceptionAdapter}) in
	 * case an I/O exception occurs while reading the input
	 */
	public void nextToken() {
		skipWhitespaceAndComments();
		
		lexem.setLength(0);
		switch (la) {
			case '(':
				appendOne();
				type = LEFT_PAREN;
				break;
			case ')':
				appendOne();
				type = RIGHT_PAREN;
				break;
			case ',':
				appendOne();
				type = COMMA;
				break;
			case '.':
				appendOne();
				type = DOT;
				break;
			case '<':
			case ':':
				// only a rule arrow if immediately followed by '-'; otherwise
				// the single character is reported as unknown
				appendOne();
				if (la == '-') {
					appendOne();
					type = RULE_ARROW;
				} else {
					type = UNKNOWN_CHAR;
				}
				break;
			case -1:
				// nothing is consumed, so EOF is returned ever after
				type = EOF;
				break;
			default:
				if (isWordLetter((char) la)) {
					// the first character decides the kind of word
					boolean upperCase = Character.isUpperCase(la);
					while (isWordLetter((char) la))
						appendOne();
					type = upperCase ? UPPERCASE_WORD : LOWERCASE_WORD;
				} else {
					appendOne();
					type = UNKNOWN_CHAR;
				}
				break;
		}
		assert type != null;
	}
	
	/**
	 * Returns the type of the last token read. This method is not meant to be
	 * called before {@link #nextToken()} is called at least once after the last
	 * {@code setInput()} call. But if such a situation occurs, {@code null} is
	 * returned. Similarily, if {@link #close()} has already been called,
	 * {@code null} is returned.
	 *
	 * @return type of the last token read
	 */
	public LpTokenType getTokenType() {
		return type;
	}
	
	/**
	 * Returns the lexem corresponding to the last token read. In case it is a
	 * {@link LpTokenType#EOF} token, empty string is returned. This method is
	 * not meant to be called before {@link #nextToken()} is called  at least
	 * once after the last {@code setInput()} call. But if such a situation
	 * occurs, {@code null} is returned. Similarily, if {@link #close()} has
	 * already been called, {@code null} is returned.
	 *
	 * @return lexem corresponding to the last token read
	 */
	public String getLexem() {
		if (type == null)
			return null;
		String result = lexem.toString();
		// the assertions below only check that the lexem is consistent with
		// the token type; they have no effect on the returned value
		assert type == EOF || lexem.length() > 0;
		switch (type) {
			case LEFT_PAREN:
				assert "(".equals(result);
				break;
			case RIGHT_PAREN:
				assert ")".equals(result);
				break;
			case COMMA:
				assert ",".equals(result);
				break;
			case DOT:
				assert ".".equals(result);
				break;
			case RULE_ARROW:
				assert "<-".equals(result) || ":-".equals(result);
				break;
			case LOWERCASE_WORD:
				for (int i = 0; i < result.length(); i++) {
					assert isWordLetter(result.charAt(i));
				}
				assert !Character.isUpperCase(result.charAt(0));
				break;
			case UPPERCASE_WORD:
				for (int i = 0; i < result.length(); i++) {
					assert isWordLetter(result.charAt(i));
				}
				assert Character.isUpperCase(result.charAt(0));
				break;
			case EOF:
				assert "".equals(result);
				break;
			case UNKNOWN_CHAR:
				assert result.length() == 1;
				break;
		}
		return result;
	}
	
	/**
	 * Returns the number of line of input on which the last token occured.
	 * Lines are numbered from 1 (see the example in class description). A
	 * newline starts when either a '\n' or a '\r' character is detected. There
	 * is one exception: a '\n' character occuring right after a '\r' character
	 * is ignored, i.e. not considered to be another line delimiter.
	 *
	 * This method is not meant to be called before {@link #nextToken()} is
	 * called  at least once after the last {@code setInput()} call. But if such
	 * a situation occurs, -1 is returned. Similarily, if {@link #close()} has
	 * already been called, -1 is returned.
	 *
	 * @return the number of line of input on which the last token occured
	 */
	public int getLineNumber() {
		if (type == null)
			return -1;
		assert lineNumber > 0;
		return lineNumber;
	}
	
	/**
	 * Returns the position of the last token's beginning within the line of
	 * input it's on. The characters on the line are numbered from 1 (see the
	 * example in class description). Tabs also count as 1 character.
	 *
	 * This method is not meant to be called before {@link #nextToken()} is
	 * called at least once after the last {@code setInput()} call. But if such
	 * a situation occurs, -1 is returned. Similarily, if {@link #close()} has
	 * already been called, -1 is returned.
	 *
	 * @return position of the last token's beginning within a line of input
	 */
	public int getPosition() {
		if (type == null)
			return -1;
		assert position > 0;
		// position points at the lookahead character, i.e. right behind the
		// last token, so step back by the token's length
		return position - lexem.length();
	}
	
	/**
	 * Returns a {@link LpToken} instance containing information about the last
	 * token read. The information is read using the {@link #getTokenType()},
	 * {@link #getLexem()}, {@link #getPosition()} and {@link #getLineNumber()}
	 * methods. If no token has been read from the current input yet or
	 * {@link #close()} has already been called, {@code null} is returned.
	 *
	 * @return a {@link LpToken} instance containing information about the last
	 * token read
	 */
	public LpToken getToken() {
		if (type == null)
			return null;
		return new LpToken(
				getTokenType(), getLexem(), getLineNumber(), getPosition());
	}
	
	/**
	 * Consumes everything that precedes the next token and produces no token
	 * itself: whitespace and line comments (from a '%' character up to, but
	 * not including, the next line break or the end of input).
	 *
	 * @throws IOException (wrapped in an {@link lp.util.ExceptionAdapter}) in
	 * case an I/O exception occurs while reading the input
	 */
	private void skipWhitespaceAndComments() {
		while (Character.isWhitespace(la))
			readNewLA();
		while (la == '%') {
			while (la != -1 && la != '\r' && la != '\n')
				readNewLA();
			// the line break ending the comment is whitespace, too
			while (Character.isWhitespace(la))
				readNewLA();
		}
	}
	
	/**
	 * Reads one character from the input and stores it in the lookahead
	 * container {@link #la}. Updates {@link #lineNumber} and {@link #position}
	 * according to the character that is being left behind. Does nothing once
	 * the end of input has been reached.
	 *
	 * @throws IOException (wrapped in an {@link lp.util.ExceptionAdapter}) in
	 * case an I/O exception occurs while reading the character
	 */
	private void readNewLA() {
		if (la == -1)
			return;
		try {
			if (la == '\r' || la == '\n') {
				boolean carriageReturn = (la == '\r');
				lineNumber++;
				position = 1;
				la = reader.read();
				// "\r\n" is a single line break
				if (carriageReturn && la == '\n')
					la = reader.read();
			} else {
				position++;
				la = reader.read();
			}
		} catch (IOException e) {
			// must be thrown: otherwise la never changes and the callers'
			// loops would keep retrying the failed read forever
			throw new ExceptionAdapter(e);
		}
	}
	
	/**
	 * Appends the current lookahead character to {@link #lexem} and reads a new
	 * one.
	 *
	 * @throws IOException (wrapped in an {@link lp.util.ExceptionAdapter}) in
	 * case an I/O exception occurs while reading the new lookahead
	 * character
	 */
	private void appendOne() {
		lexem.append((char) la);
		readNewLA();
	}
	
	/**
	 * Determines if a character belongs to the set {'_', 'a', 'b', ..., 'z',
	 * 'A', 'B', ..., 'Z', '0', '1', ..., '9'}.
	 *
	 * @param c the character in question
	 * @return {@code true} if it does belong to the set mentioned above,
	 * {@code false} otherwise.
	 */
	private static boolean isWordLetter(char c) {
		return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
				|| (c >= '0' && c <= '9') || c == '_';
	}
	
	/**
	 * Main method containing the example code from class description.
	 *
	 * @param args input arguments (ignored)
	 */
	/*public static void main(String[] args) {
		LpLexer l = new LpLexer();
		l.setInput("Simple, short sentence.");
		l.nextToken();
		LpTokenType t = l.getTokenType();
		while (t != LpTokenType.EOF) {
			System.out.println("token: " + t.toString() + "; lexem: "
					+ l.getLexem() + "; line number: " + l.getLineNumber()
					+ "; position: " + l.getPosition());
			l.nextToken();
			t = l.getTokenType();
		}
		l.close();
	}/**/
}