//===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This class implements the lexer for assembly files. // //===----------------------------------------------------------------------===// #include "llvm/MC/MCParser/AsmLexer.h" #include "llvm/Support/SMLoc.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/MC/MCAsmInfo.h" #include <cctype> #include <cerrno> #include <cstdio> #include <cstdlib> using namespace llvm; AsmLexer::AsmLexer(const MCAsmInfo &_MAI) : MAI(_MAI) { CurBuf = NULL; CurPtr = NULL; isAtStartOfLine = true; } AsmLexer::~AsmLexer() { } void AsmLexer::setBuffer(const MemoryBuffer *buf, const char *ptr) { CurBuf = buf; if (ptr) CurPtr = ptr; else CurPtr = CurBuf->getBufferStart(); TokStart = 0; } /// ReturnError - Set the error to the specified string at the specified /// location. This is defined to always return AsmToken::Error. AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) { SetError(SMLoc::getFromPointer(Loc), Msg); return AsmToken(AsmToken::Error, StringRef(Loc, 0)); } int AsmLexer::getNextChar() { char CurChar = *CurPtr++; switch (CurChar) { default: return (unsigned char)CurChar; case 0: // A nul character in the stream is either the end of the current buffer or // a random nul in the file. Disambiguate that here. if (CurPtr-1 != CurBuf->getBufferEnd()) return 0; // Just whitespace. // Otherwise, return end of file. --CurPtr; // Another call to lex will return EOF again. return EOF; } } /// LexFloatLiteral: [0-9]*[.][0-9]*([eE][+-]?[0-9]*)? /// /// The leading integral digit sequence and dot should have already been /// consumed, some or all of the fractional digit sequence *can* have been /// consumed. AsmToken AsmLexer::LexFloatLiteral() { // Skip the fractional digit sequence. while (isdigit(*CurPtr)) ++CurPtr; // Check for exponent; we intentionally accept a slighlty wider set of // literals here and rely on the upstream client to reject invalid ones (e.g., // "1e+"). if (*CurPtr == 'e' || *CurPtr == 'E') { ++CurPtr; if (*CurPtr == '-' || *CurPtr == '+') ++CurPtr; while (isdigit(*CurPtr)) ++CurPtr; } return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart)); } /// LexIdentifier: [a-zA-Z_.][a-zA-Z0-9_$.@]* static bool IsIdentifierChar(char c) { return isalnum(c) || c == '_' || c == '$' || c == '.' || c == '@'; } AsmToken AsmLexer::LexIdentifier() { // Check for floating point literals. if (CurPtr[-1] == '.' && isdigit(*CurPtr)) { // Disambiguate a .1243foo identifier from a floating literal. while (isdigit(*CurPtr)) ++CurPtr; if (*CurPtr == 'e' || *CurPtr == 'E' || !IsIdentifierChar(*CurPtr)) return LexFloatLiteral(); } while (IsIdentifierChar(*CurPtr)) ++CurPtr; // Handle . as a special case. if (CurPtr == TokStart+1 && TokStart[0] == '.') return AsmToken(AsmToken::Dot, StringRef(TokStart, 1)); return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart)); } /// LexSlash: Slash: / /// C-Style Comment: /* ... */ AsmToken AsmLexer::LexSlash() { switch (*CurPtr) { case '*': break; // C style comment. case '/': return ++CurPtr, LexLineComment(); default: return AsmToken(AsmToken::Slash, StringRef(CurPtr-1, 1)); } // C Style comment. ++CurPtr; // skip the star. while (1) { int CurChar = getNextChar(); switch (CurChar) { case EOF: return ReturnError(TokStart, "unterminated comment"); case '*': // End of the comment? if (CurPtr[0] != '/') break; ++CurPtr; // End the */. return LexToken(); } } } /// LexLineComment: Comment: #[^\n]* /// : //[^\n]* AsmToken AsmLexer::LexLineComment() { // FIXME: This is broken if we happen to a comment at the end of a file, which // was .included, and which doesn't end with a newline. int CurChar = getNextChar(); while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF) CurChar = getNextChar(); if (CurChar == EOF) return AsmToken(AsmToken::Eof, StringRef(CurPtr, 0)); return AsmToken(AsmToken::EndOfStatement, StringRef(CurPtr, 0)); } static void SkipIgnoredIntegerSuffix(const char *&CurPtr) { if (CurPtr[0] == 'L' && CurPtr[1] == 'L') CurPtr += 2; if (CurPtr[0] == 'U' && CurPtr[1] == 'L' && CurPtr[2] == 'L') CurPtr += 3; } /// LexDigit: First character is [0-9]. /// Local Label: [0-9][:] /// Forward/Backward Label: [0-9][fb] /// Binary integer: 0b[01]+ /// Octal integer: 0[0-7]+ /// Hex integer: 0x[0-9a-fA-F]+ /// Decimal integer: [1-9][0-9]* AsmToken AsmLexer::LexDigit() { // Decimal integer: [1-9][0-9]* if (CurPtr[-1] != '0' || CurPtr[0] == '.') { while (isdigit(*CurPtr)) ++CurPtr; // Check for floating point literals. if (*CurPtr == '.' || *CurPtr == 'e') { ++CurPtr; return LexFloatLiteral(); } StringRef Result(TokStart, CurPtr - TokStart); long long Value; if (Result.getAsInteger(10, Value)) { // Allow positive values that are too large to fit into a signed 64-bit // integer, but that do fit in an unsigned one, we just convert them over. unsigned long long UValue; if (Result.getAsInteger(10, UValue)) return ReturnError(TokStart, "invalid decimal number"); Value = (long long)UValue; } // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL // suffixes on integer literals. SkipIgnoredIntegerSuffix(CurPtr); return AsmToken(AsmToken::Integer, Result, Value); } if (*CurPtr == 'b') { ++CurPtr; // See if we actually have "0b" as part of something like "jmp 0b\n" if (!isdigit(CurPtr[0])) { --CurPtr; StringRef Result(TokStart, CurPtr - TokStart); return AsmToken(AsmToken::Integer, Result, 0); } const char *NumStart = CurPtr; while (CurPtr[0] == '0' || CurPtr[0] == '1') ++CurPtr; // Requires at least one binary digit. if (CurPtr == NumStart) return ReturnError(TokStart, "invalid binary number"); StringRef Result(TokStart, CurPtr - TokStart); long long Value; if (Result.substr(2).getAsInteger(2, Value)) return ReturnError(TokStart, "invalid binary number"); // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL // suffixes on integer literals. SkipIgnoredIntegerSuffix(CurPtr); return AsmToken(AsmToken::Integer, Result, Value); } if (*CurPtr == 'x') { ++CurPtr; const char *NumStart = CurPtr; while (isxdigit(CurPtr[0])) ++CurPtr; // Requires at least one hex digit. if (CurPtr == NumStart) return ReturnError(CurPtr-2, "invalid hexadecimal number"); unsigned long long Result; if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result)) return ReturnError(TokStart, "invalid hexadecimal number"); // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL // suffixes on integer literals. SkipIgnoredIntegerSuffix(CurPtr); return AsmToken(AsmToken::Integer, StringRef(TokStart, CurPtr - TokStart), (int64_t)Result); } // Must be an octal number, it starts with 0. while (*CurPtr >= '0' && *CurPtr <= '9') ++CurPtr; StringRef Result(TokStart, CurPtr - TokStart); long long Value; if (Result.getAsInteger(8, Value)) return ReturnError(TokStart, "invalid octal number"); // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL // suffixes on integer literals. SkipIgnoredIntegerSuffix(CurPtr); return AsmToken(AsmToken::Integer, Result, Value); } /// LexSingleQuote: Integer: 'b' AsmToken AsmLexer::LexSingleQuote() { int CurChar = getNextChar(); if (CurChar == '\\') CurChar = getNextChar(); if (CurChar == EOF) return ReturnError(TokStart, "unterminated single quote"); CurChar = getNextChar(); if (CurChar != '\'') return ReturnError(TokStart, "single quote way too long"); // The idea here being that 'c' is basically just an integral // constant. StringRef Res = StringRef(TokStart,CurPtr - TokStart); long long Value; if (Res.startswith("\'\\")) { char theChar = Res[2]; switch (theChar) { default: Value = theChar; break; case '\'': Value = '\''; break; case 't': Value = '\t'; break; case 'n': Value = '\n'; break; case 'b': Value = '\b'; break; } } else Value = TokStart[1]; return AsmToken(AsmToken::Integer, Res, Value); } /// LexQuote: String: "..." AsmToken AsmLexer::LexQuote() { int CurChar = getNextChar(); // TODO: does gas allow multiline string constants? while (CurChar != '"') { if (CurChar == '\\') { // Allow \", etc. CurChar = getNextChar(); } if (CurChar == EOF) return ReturnError(TokStart, "unterminated string constant"); CurChar = getNextChar(); } return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart)); } StringRef AsmLexer::LexUntilEndOfStatement() { TokStart = CurPtr; while (!isAtStartOfComment(*CurPtr) && // Start of line comment. !isAtStatementSeparator(CurPtr) && // End of statement marker. *CurPtr != '\n' && *CurPtr != '\r' && (*CurPtr != 0 || CurPtr != CurBuf->getBufferEnd())) { ++CurPtr; } return StringRef(TokStart, CurPtr-TokStart); } StringRef AsmLexer::LexUntilEndOfLine() { TokStart = CurPtr; while (*CurPtr != '\n' && *CurPtr != '\r' && (*CurPtr != 0 || CurPtr != CurBuf->getBufferEnd())) { ++CurPtr; } return StringRef(TokStart, CurPtr-TokStart); } bool AsmLexer::isAtStartOfComment(char Char) { // FIXME: This won't work for multi-character comment indicators like "//". return Char == *MAI.getCommentString(); } bool AsmLexer::isAtStatementSeparator(const char *Ptr) { return strncmp(Ptr, MAI.getSeparatorString(), strlen(MAI.getSeparatorString())) == 0; } AsmToken AsmLexer::LexToken() { TokStart = CurPtr; // This always consumes at least one character. int CurChar = getNextChar(); if (isAtStartOfComment(CurChar)) { // If this comment starts with a '#', then return the Hash token and let // the assembler parser see if it can be parsed as a cpp line filename // comment. We do this only if we are at the start of a line. if (CurChar == '#' && isAtStartOfLine) return AsmToken(AsmToken::Hash, StringRef(TokStart, 1)); isAtStartOfLine = true; return LexLineComment(); } if (isAtStatementSeparator(TokStart)) { CurPtr += strlen(MAI.getSeparatorString()) - 1; return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, strlen(MAI.getSeparatorString()))); } // If we're missing a newline at EOF, make sure we still get an // EndOfStatement token before the Eof token. if (CurChar == EOF && !isAtStartOfLine) { isAtStartOfLine = true; return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1)); } isAtStartOfLine = false; switch (CurChar) { default: // Handle identifier: [a-zA-Z_.][a-zA-Z0-9_$.@]* if (isalpha(CurChar) || CurChar == '_' || CurChar == '.') return LexIdentifier(); // Unknown character, emit an error. return ReturnError(TokStart, "invalid character in input"); case EOF: return AsmToken(AsmToken::Eof, StringRef(TokStart, 0)); case 0: case ' ': case '\t': // Ignore whitespace. return LexToken(); case '\n': // FALL THROUGH. case '\r': isAtStartOfLine = true; return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1)); case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1)); case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1)); case '-': return AsmToken(AsmToken::Minus, StringRef(TokStart, 1)); case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1)); case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1)); case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1)); case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1)); case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1)); case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1)); case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1)); case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1)); case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1)); case '$': return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1)); case '@': return AsmToken(AsmToken::At, StringRef(TokStart, 1)); case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1)); case '=': if (*CurPtr == '=') return ++CurPtr, AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2)); return AsmToken(AsmToken::Equal, StringRef(TokStart, 1)); case '|': if (*CurPtr == '|') return ++CurPtr, AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2)); return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1)); case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1)); case '&': if (*CurPtr == '&') return ++CurPtr, AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2)); return AsmToken(AsmToken::Amp, StringRef(TokStart, 1)); case '!': if (*CurPtr == '=') return ++CurPtr, AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2)); return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1)); case '%': return AsmToken(AsmToken::Percent, StringRef(TokStart, 1)); case '/': return LexSlash(); case '#': return AsmToken(AsmToken::Hash, StringRef(TokStart, 1)); case '\'': return LexSingleQuote(); case '"': return LexQuote(); case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': return LexDigit(); case '<': switch (*CurPtr) { case '<': return ++CurPtr, AsmToken(AsmToken::LessLess, StringRef(TokStart, 2)); case '=': return ++CurPtr, AsmToken(AsmToken::LessEqual, StringRef(TokStart, 2)); case '>': return ++CurPtr, AsmToken(AsmToken::LessGreater, StringRef(TokStart, 2)); default: return AsmToken(AsmToken::Less, StringRef(TokStart, 1)); } case '>': switch (*CurPtr) { case '>': return ++CurPtr, AsmToken(AsmToken::GreaterGreater, StringRef(TokStart, 2)); case '=': return ++CurPtr, AsmToken(AsmToken::GreaterEqual, StringRef(TokStart, 2)); default: return AsmToken(AsmToken::Greater, StringRef(TokStart, 1)); } // TODO: Quoted identifiers (objc methods etc) // local labels: [0-9][:] // Forward/backward labels: [0-9][fb] // Integers, fp constants, character constants. } }