/*
* An OpenOffice formula to LaTeX translator: the scanner
* Copyright (C) 2009
* Andreas Harnack (ah8 at freenet dot de)
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along
* with this library; see the file COPYING. If not, write to the Free
* Software Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307,
* USA.
* As a special exception, you may use this file as part of a free software
* library without restriction. Specifically, if other files instantiate
* templates or use macros or inline functions from this file, or you compile
* this file and link it with other files to produce an executable, this
* file does not by itself cause the resulting executable to be covered by
* the GNU General Public License. This exception does not however
* invalidate any other reasons why the executable file might be covered by
* the GNU General Public License.
*/
#include "scanner.h"
using namespace ooMath;
// One row of the keyword table that Scanner::lookup() searches.
// The members are filled by positional aggregate initialisation, so their
// order must not change.
struct Scanner::TokenTableEntry
{
    char const* ident;  // keyword spelling the scanned identifier is compared against
    TokenType   type;   // token type reported when the keyword matches
    TokenGroup  group;  // token group reported when the keyword matches
    int         level;  // level value handed on to token(); presumably a
                        // binding/precedence level - confirm with the parser
};
// Looks up a scanned identifier in the table of predefined keywords.
//
// id: the identifier text collected by the scanner.
// Returns a pointer to the table entry for which id.comp(entry.ident)
// yields 0, or 0 when there is no such entry.
//
// The table is searched by bisection and therefore has to stay sorted by
// ident in ascending character order; insert new keywords at the right place.
Scanner::TokenTableEntry const* Scanner::lookup(Scanner::TokenBuffer const& id)
{
    static const TokenTableEntry tokenTbl[] =
    {
        { "abs", ABS, GUNOPER, 13 },
        { "acute", ACUTE, GATTRIBUT, 5 },
        { "aleph", ALEPH, GSTANDALONE, 5 },
        { "alignb", ALIGNC, GALIGN|GDISCARDED, 0 },
        { "alignc", ALIGNC, GALIGN, 0 },
        { "alignl", ALIGNL, GALIGN, 0 },
        { "alignm", ALIGNC, GALIGN|GDISCARDED, 0 },
        { "alignr", ALIGNR, GALIGN, 0 },
        { "alignt", ALIGNC, GALIGN|GDISCARDED, 0 },
        { "and", AND, GPRODUCT, 0 },
        { "approx", APPROX, GRELATION, 0 },
        { "arccos", ACOS, GFUNCTION, 5 },
        { "arccot", ACOT, GFUNCTION, 5 },
        { "arcosh", ACOSH, GFUNCTION, 5 },
        { "arcoth", ACOTH, GFUNCTION, 5 },
        { "arcsin", ASIN, GFUNCTION, 5 },
        { "arctan", ATAN, GFUNCTION, 5 },
        { "arsinh", ASINH, GFUNCTION, 5 },
        { "artanh", ATANH, GFUNCTION, 5 },
        { "backepsilon", BACKEPSILON, GSTANDALONE, 5 },
        { "bar", BAR, GATTRIBUT, 5 },
        { "binom", BINOM, GNONE, 5 },
        { "black", BLACK, GCOLOR, 0 },
        { "blue", BLUE, GCOLOR, 0 },
        { "bold", BOLD, GFONTATTR, 5 },
        { "boper", BOPER, GPRODUCT, 0 },
        { "breve", BREVE, GATTRIBUT, 5 },
        { "bslash", BACKSLASH, GPRODUCT, 0 },
        { "cdot", CDOT, GPRODUCT, 0 },
        { "check", CHECK, GATTRIBUT, 5 },
        { "circ", CIRC, GSTANDALONE, 5 },
        { "circle", CIRCLE, GATTRIBUT, 5 },
        { "color", COLOR, GFONTATTR, 5 },
        { "coprod", COPROD, GOPER, 5 },
        { "cos", COS, GFUNCTION, 5 },
        { "cosh", COSH, GFUNCTION, 5 },
        { "cot", COT, GFUNCTION, 5 },
        { "coth", COTH, GFUNCTION, 5 },
        { "csub", CSUB, GPOWER, 0 },
        { "csup", CSUP, GPOWER, 0 },
        { "cyan", CYAN, GCOLOR, 0 },
        { "dddot", DDDOT, GATTRIBUT, 5 },
        { "ddot", DDOT, GATTRIBUT, 5 },
        { "def", DEF, GRELATION, 0 },
        { "div", DIV, GPRODUCT, 0 },
        { "divides", DIVIDES, GRELATION, 0 },
        { "dlarrow", DLARROW, GSTANDALONE, 5 },
        { "dlrarrow", DLRARROW, GSTANDALONE, 5 },
        { "dot", DOT, GATTRIBUT, 5 },
        { "dotsaxis", DOTSAXIS, GSTANDALONE, 5 },
        { "dotsdiag", DOTSDIAG, GSTANDALONE, 5 },
        { "dotsdown", DOTSDOWN, GSTANDALONE, 5 },
        { "dotslow", DOTSLOW, GSTANDALONE, 5 },
        { "dotsup", DOTSUP, GSTANDALONE, 5 },
        { "dotsvert", DOTSVERT, GSTANDALONE, 5 },
        { "downarrow", DOWNARROW, GSTANDALONE, 5 },
        { "drarrow", DRARROW, GSTANDALONE, 5 },
        { "emptyset", EMPTYSET, GSTANDALONE, 5 },
        { "equiv", EQUIV, GRELATION, 0 },
        { "exists", EXISTS, GSTANDALONE, 5 },
        { "exp", EXP, GFUNCTION, 5 },
        { "fact", FACT, GUNOPER, 5 },
        { "fixed", FIXED, GFONT, 0 },
        { "font", FONT, GFONTATTR, 5 },
        { "forall", FORALL, GSTANDALONE, 5 },
        { "from", FROM, GLIMIT, 0 },
        { "func", FUNC, GFUNCTION, 5 },
        { "ge", GE, GRELATION, 0 },
        { "geslant", GESLANT, GRELATION, 0 },
        { "gg", GG, GRELATION, 0 },
        { "grave", GRAVE, GATTRIBUT, 5 },
        { "green", GREEN, GCOLOR, 0 },
        { "gt", GT, GRELATION, 0 },
        { "hat", HAT, GATTRIBUT, 5 },
        { "hbar", HBAR, GSTANDALONE, 5 },
        { "iiint", IIINT, GOPER, 5 },
        { "iint", IINT, GOPER, 5 },
        { "im", IM, GSTANDALONE, 5 },
        { "in", IN, GRELATION, 0 },
        { "infinity", INFINITY, GSTANDALONE, 5 },
        { "infty", INFINITY, GSTANDALONE, 5 },
        { "int", INT, GOPER, 5 },
        { "intersection", INTERSECT, GPRODUCT, 0 },
        { "ital", ITALIC, GFONTATTR, 5 },
        { "italic", ITALIC, GFONTATTR, 5 },
        { "lambdabar", LAMBDABAR, GSTANDALONE, 5 },
        { "langle", LANGLE, GLBRACES, 5 },
        { "lbrace", LBRACE, GLBRACES, 5 },
        { "lceil", LCEIL, GLBRACES, 5 },
        { "ldbracket", LDBRACKET, GLBRACES, 5 },
        { "ldline", LDLINE, GLBRACES, 5 },
        { "le", LE, GRELATION, 0 },
        { "left", LEFT, GNONE, 5 },
        { "leftarrow", LEFTARROW, GSTANDALONE, 5 },
        { "leslant", LESLANT, GRELATION, 0 },
        { "lfloor", LFLOOR, GLBRACES, 5 },
        { "lim", LIM, GOPER, 5 },
        { "liminf", LIMINF, GOPER, 5 },
        { "limsup", LIMSUP, GOPER, 5 },
        { "lint", LINT, GOPER, 5 },
        { "ll", LL, GRELATION, 0 },
        { "lline", LLINE, GLBRACES, 5 },
        { "llint", LLINT, GOPER, 5 },
        { "lllint", LLLINT, GOPER, 5 },
        { "ln", LN, GFUNCTION, 5 },
        { "log", LOG, GFUNCTION, 5 },
        { "lsub", LSUB, GPOWER, 0 },
        { "lsup", LSUP, GPOWER, 0 },
        { "lt", LT, GRELATION, 0 },
        { "magenta", MAGENTA, GCOLOR, 0 },
        { "matrix", MATRIX, GNONE, 5 },
        { "minusplus", MINUSPLUS, GOPER|GSUM, 5 },
        { "mline", MLINE, GNONE, 0 },
        { "mz23", DEBG, GATTRIBUT, 0 },
        { "nabla", NABLA, GSTANDALONE, 5 },
        { "nbold", NBOLD, GFONTATTR, 5 },
        { "ndivides", NDIVIDES, GRELATION, 0 },
        { "neg", NEG, GOPER, 5 },
        { "neq", NEQ, GRELATION, 0 },
        { "newline", NEWLINE, GNONE, 0 },
        { "ni", NI, GRELATION, 0 },
        { "nitalic", NITALIC, GFONTATTR, 5 },
        { "none", NONE, GLBRACES|GRBRACES, 0 },
        { "notin", NOTIN, GRELATION, 0 },
        { "nroot", NROOT, GUNOPER, 5 },
        { "nsubset", NSUBSET, GRELATION, 0 },
        { "nsubseteq", NSUBSETEQ, GRELATION, 0 },
        { "nsupset", NSUPSET, GRELATION, 0 },
        { "nsupseteq", NSUPSETEQ, GRELATION, 0 },
        { "odivide", ODIVIDE, GPRODUCT, 0 },
        { "odot", ODOT, GPRODUCT, 0 },
        { "ominus", OMINUS, GSUM, 0 },
        { "oper", OPER, GOPER, 5 },
        { "oplus", OPLUS, GSUM, 0 },
        { "or", OR, GSUM, 0 },
        { "ortho", ORTHO, GRELATION, 0 },
        { "otimes", OTIMES, GPRODUCT, 0 },
        { "over", OVER, GPRODUCT, 0 },
        { "overbrace", OVERBRACE, GPRODUCT, 5 },
        { "overline", OVERLINE, GATTRIBUT, 5 },
        { "overstrike", OVERSTRIKE, GATTRIBUT, 5 },
        { "owns", NI, GRELATION, 0 },
        { "parallel", PARALLEL, GRELATION, 0 },
        { "partial", PARTIAL, GSTANDALONE, 5 },
        { "phantom", PHANTOM, GFONTATTR, 5 },
        { "plusminus", PLUSMINUS, GOPER|GSUM, 5 },
        { "prod", PROD, GOPER, 5 },
        { "prop", PROP, GRELATION, 0 },
        { "rangle", RANGLE, GRBRACES, 0 },
        { "rbrace", RBRACE, GRBRACES, 0 },
        { "rceil", RCEIL, GRBRACES, 0 },
        { "rdbracket", RDBRACKET, GRBRACES, 0 },
        { "rdline", RDLINE, GRBRACES, 0 },
        { "re", RE, GSTANDALONE, 5 },
        { "red", RED, GCOLOR, 0 },
        { "rfloor", RFLOOR, GRBRACES, 0 },
        { "right", RIGHT, GNONE, 0 },
        { "rightarrow", RIGHTARROW, GSTANDALONE, 5 },
        { "rline", RLINE, GRBRACES, 0 },
        { "rsub", RSUB, GPOWER, 0 },
        { "rsup", RSUP, GPOWER, 0 },
        { "sans", SANS, GFONT, 0 },
        { "serif", SERIF, GFONT, 0 },
        { "setc", SETC, GSTANDALONE, 5 },
        { "setminus", BACKSLASH, GPRODUCT, 0 },
        { "setn", SETN, GSTANDALONE, 5 },
        { "setq", SETQ, GSTANDALONE, 5 },
        { "setr", SETR, GSTANDALONE, 5 },
        { "setz", SETZ, GSTANDALONE, 5 },
        { "sim", SIM, GRELATION, 0 },
        { "simeq", SIMEQ, GRELATION, 0 },
        { "sin", SIN, GFUNCTION, 5 },
        { "sinh", SINH, GFUNCTION, 5 },
        { "size", SIZE, GFONTATTR, 5 },
        { "slash", SLASH, GPRODUCT, 0 },
        { "sqrt", SQRT, GUNOPER, 5 },
        { "stack", STACK, GNONE, 5 },
        { "sub", RSUB, GPOWER, 0 },
        { "subset", SUBSET, GRELATION, 0 },
        { "subseteq", SUBSETEQ, GRELATION, 0 },
        { "sum", SUM, GOPER, 5 },
        { "sup", RSUP, GPOWER, 0 },
        { "supset", SUPSET, GRELATION, 0 },
        { "supseteq", SUPSETEQ, GRELATION, 0 },
        { "tan", TAN, GFUNCTION, 5 },
        { "tanh", TANH, GFUNCTION, 5 },
        { "tilde", TILDE, GATTRIBUT, 5 },
        { "times", TIMES, GPRODUCT, 0 },
        { "to", TO, GLIMIT, 0 },
        { "toward", TOWARD, GRELATION, 0 },
        { "transl", TRANSL, GRELATION, 0 },
        { "transr", TRANSR, GRELATION, 0 },
        { "underbrace", UNDERBRACE, GPRODUCT, 5 },
        { "underline", UNDERLINE, GATTRIBUT, 5 },
        { "union", UNION, GSUM, 0 },
        { "uoper", UOPER, GUNOPER, 5 },
        { "uparrow", UPARROW, GSTANDALONE, 5 },
        { "vec", VEC, GATTRIBUT, 5 },
        { "white", WHITE, GCOLOR, 0 },
        { "widebslash", WIDEBACKSLASH, GPRODUCT, 0 },
        { "widehat", WIDEHAT, GATTRIBUT, 5 },
        { "wideslash", WIDESLASH, GPRODUCT, 0 },
        { "widetilde", WIDETILDE, GATTRIBUT, 5 },
        { "widevec", WIDEVEC, GATTRIBUT, 5 },
        { "wp", WP, GSTANDALONE, 5 },
        { "yellow", YELLOW, GCOLOR, 0 }
    };

    // Bisection over the window [lo, lo + remaining).
    TokenTableEntry const *lo = tokenTbl;
    int remaining = sizeof(tokenTbl)/sizeof(tokenTbl[0]);
    while ( remaining > 0 ) {
        int const half = remaining / 2;
        TokenTableEntry const *probe = lo + half;
        int const cmp = id.comp(probe->ident);
        if ( cmp == 0 )
            return probe;
        // NOTE(review): a negative result continues in the upper half, so
        // comp() is expected to be negative when id sorts *after* its
        // argument (the reverse of strcmp(id, arg)) - confirm against
        // TokenBuffer::comp.
        if ( cmp < 0 ) {
            lo = probe + 1;
            remaining -= half + 1;
        }
        else {
            remaining = half;
        }
    }
    return 0;
}
// Hands a character back to the scanner: it is stacked on undoBuffer, from
// where next() takes characters (most recently undone first) before reading
// new input. The column counter is moved back by one to match.
void Scanner::undo(CharType ch)
{
    undoBuffer.push_back(ch);
    col -= 1;
}
// Delivers the next input character and advances the column counter.
// Characters handed back through undo() are delivered first, most recently
// undone first; only when none are pending is nextChar() asked for new input.
Scanner::CharType Scanner::next()
{
    CharType ch;
    if ( !undoBuffer.empty() ) {
        // replay a pushed-back character
        ch = undoBuffer.back();
        undoBuffer.pop_back();
    }
    else {
        ch = nextChar();
    }
    ++col;
    return ch;
}
// Scans the input and returns the next token.
//
// White space and "%%" line comments in front of the token are skipped; row
// is incremented and col reset on every newline seen while skipping. The
// text of identifiers, numbers, strings, "%name" specials and otherwise
// unrecognised characters is collected in tokenBuffer (cleared on entry).
// Identifiers found by lookup() are returned with the type, group and level
// of their table entry. At end of input an END token is returned.
//
// Every branch that reads one character of look-ahead gives it back with
// undo() when it does not belong to the token.
Token Scanner::nextToken()
{
    CharType ch = next();
    tokenBuffer.clear();

    // skip spaces and comments
    while ( 1 ) {
        if ( ch == '\n' ) {
            ++row, col = 0, ch = next();
        }
        else if ( isspace(ch) ) {
            ch = next();
        }
        else if ( ch == '%' ) {
            // "%%" starts a comment that runs to the end of the line
            if ( (ch = next()) == '%' ) {
                while ( (ch = next()) != '\n' ) {
                    if ( ch == eof )
                        return token(END, GNONE, 0,"");
                }
                ++row, col = 0, ch = next();
            }
            else {
                // a single '%' is the start of a token, handled below
                undo(ch); ch = '%';
                break;
            }
        }
        else
            break;
    }

    if ( ch == eof )
        return token(END, GNONE, 0, "");

    if ( isletter(ch) ) {
        // identifier or keyword
        do tokenBuffer.add(ch); while ( isalpha((ch=next())) );
        undo(ch);
        TokenTableEntry const *e = lookup(tokenBuffer);
        if ( e )
            return token(e->type, e->group, e->level, e->ident);
        return token(IDENT, GNONE, 5, "<identifier>");
    }

    if ( isnumber(ch) ) {
        // number: digits, optionally followed by '.' and more digits
        do tokenBuffer.add(ch); while ( isnumber((ch=next())) );
        if ( ch == '.' )
            do tokenBuffer.add(ch); while ( isnumber((ch=next())) );
        undo(ch);
        return token(NUMBER, GNONE, 5, "<number>");
    }

    if ( ch == '%' ) {
        tokenBuffer.add(ch);
        if ( isletter(ch = next()) ) {
            // user-defined character: '%' followed by a name
            do tokenBuffer.add(ch); while ( isalpha((ch=next())) );
            undo(ch);
            return token(SPECIAL, GNONE, 5, "<special>");
        }
        // a lone '%' is returned as text
        undo(ch);
        return token(TEXT, GNONE, 5, "<text>");
    }

    if ( ch == '.' ) {
        if ( isnumber(ch = next()) ) {
            // number starting with '.'; only the point is added here, the
            // loop below stores the first digit itself (adding it in both
            // places used to duplicate it, turning ".5" into ".55")
            tokenBuffer.add('.');
            do tokenBuffer.add(ch); while ( isnumber((ch=next())) );
            undo(ch);
            return token(NUMBER, GNONE, 5, "<number>");
        }
        // not a number: give the look-ahead back, otherwise the character
        // following the point would be lost
        undo(ch);
        return token(POINT, GNONE, 0, ".");
    }

    if ( ch == '"' ) {
        // double quoted string
        while ( (ch=next()) != '"' ) {
            if ( ch == eof ) {
                // unterminated string: stop instead of scanning past the
                // end of input, and leave eof for the next call to report
                undo(ch);
                break;
            }
            if ( ch == '\\' ) {
                // \\ and \" stand for the character itself; before any
                // other character the backslash is dropped and that
                // character is scanned again as ordinary string content
                switch ( (ch = next()) ) {
                case '\\':
                case '"':
                    tokenBuffer.add(ch);
                    break;
                default:
                    undo(ch);
                }
            }
            else {
                tokenBuffer.add(ch);
            }
        }
        return token(TEXT, GNONE, 5, "<string>");
    }

    // operators and punctuation
    switch ( ch )
    {
    case '_':
        return token(RSUB, GPOWER, 0, "_");
    case '<':
        switch ( (ch = next()) )
        {
        case '>':
            return token(NEQ, GRELATION, 0, "<>");
        case '=':
            return token(LE, GRELATION, 0, "<=");
        case '<':
            return token(LL, GRELATION, 0, "<<");
        default:
            undo(ch);
            return token(LT, GRELATION, 0, "<");
        }
        break;
    case '>':
        switch ( (ch = next()) )
        {
        case '=':
            return token(GE, GRELATION, 0, ">=");
        case '>':
            return token(GG, GRELATION, 0, ">>" );
        default:
            undo(ch);
            return token(GT, GRELATION, 0, ">");
        }
        break;
    case '[':
        return token(LBRACKET, GLBRACES, 5, "[");
    case '\\':
        return token(ESCAPE, GNONE, 5, "\\");
    case ']':
        return token(RBRACKET, GRBRACES, 0, "]");
    case '^':
        return token(RSUP, GPOWER, 0, "^");
    case '`':
        return token(SBLANK, GBLANK, 5, "`");
    case '{':
        return token(LGROUP, GNONE, 5, "{");
    case '|':
        return token(OR, GSUM, 0, "|");
    case '}':
        return token(RGROUP, GNONE, 0, "}");
    case '~':
        return token(BLANK, GBLANK, 5, "~");
    case '#':
        if ( (ch = next()) == '#' )
            return token(DPOUND, GNONE, 0, "##");
        undo(ch);
        return token(POUND, GNONE, 0, "#");
    case '&':
        return token(AND, GPRODUCT, 0, "&");
    case '(':
        return token(LPARENT, GLBRACES, 5, "(");
    case ')':
        return token(RPARENT, GRBRACES, 0, ")");
    case '*':
        return token(MULTIPLY, GPRODUCT, 0, "*");
    case '+':
        if ( (ch = next()) == '-' )
            return token(PLUSMINUS, GOPER|GSUM, 5, "+-");
        undo(ch);
        return token(PLUS, GOPER|GSUM, 5, "+");
    case '-':
        if ( (ch = next()) == '+' )
            return token(MINUSPLUS, GOPER|GSUM, 5, "-+");
        undo(ch);
        return token(MINUS, GOPER|GSUM, 5, "-");
    case '/':
        return token(DIVIDEBY, GPRODUCT, 0, "/");
    case '=':
        return token(ASSIGN, GRELATION, 0, "=");
    default:
        break;
    }

    // anything else is passed on as a single character
    tokenBuffer.add(ch);
    return token(CHARACTER, GNONE, 5, "<char>");
}