// Copyright 2017 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "src/asmjs/asm-scanner.h"

#include <cmath>

#include "src/char-predicates-inl.h"
#include "src/conversions.h"
#include "src/flags.h"
#include "src/parsing/scanner.h"
#include "src/unicode-cache.h"
namespace v8 {
namespace internal {
namespace {
// Cap number of identifiers to ensure we can assign both global and
// local ones a token id in the range of an int32_t.
// (Names in an anonymous namespace already have internal linkage, so no
// `static` is needed.)
const int kMaxIdentifierCount = 0xF000000;
}  // namespace
// Constructs a scanner reading from |stream|. All token/position slots start
// out as kUninitialized/0, the name tables are pre-populated from the X-macro
// lists below, and the first token is scanned before the constructor returns.
AsmJsScanner::AsmJsScanner(Utf16CharacterStream* stream)
: stream_(stream),
token_(kUninitialized),
preceding_token_(kUninitialized),
next_token_(kUninitialized),
position_(0),
preceding_position_(0),
next_position_(0),
rewind_(false),
in_local_scope_(false),
global_count_(0),
double_value_(0.0),
unsigned_value_(0),
preceded_by_newline_(false) {
// Each list entry maps the spelling of |name| to its fixed kToken_<name> id.
// The three lists below are registered as property names.
#define V(name, _junk1, _junk2, _junk3) property_names_[#name] = kToken_##name;
STDLIB_MATH_FUNCTION_LIST(V)
STDLIB_ARRAY_TYPE_LIST(V)
#undef V
#define V(name, _junk1) property_names_[#name] = kToken_##name;
STDLIB_MATH_VALUE_LIST(V)
#undef V
#define V(name) property_names_[#name] = kToken_##name;
STDLIB_OTHER_LIST(V)
#undef V
// Keywords are registered in the global name table instead.
#define V(name) global_names_[#name] = kToken_##name;
KEYWORD_NAME_LIST(V)
#undef V
// Scan the first token so that the scanner starts out positioned on it.
Next();
}
// Advances the scanner by one token, updating token_, position_ and the
// preceding_* slots. Whitespace and comments are skipped. Once the current
// token is kEndOfInput or kParseError, further calls leave the state untouched.
void AsmJsScanner::Next() {
// A pending Rewind() is undone by shifting the saved lookahead token and
// position back into place; nothing is read from the stream in that case.
if (rewind_) {
preceding_token_ = token_;
preceding_position_ = position_;
token_ = next_token_;
position_ = next_position_;
next_token_ = kUninitialized;
next_position_ = 0;
rewind_ = false;
return;
}
// End of input and parse errors are sticky.
if (token_ == kEndOfInput || token_ == kParseError) {
return;
}
#if DEBUG
// Trace the token reported by Token() before it gets replaced below.
if (FLAG_trace_asm_scanner) {
if (Token() == kDouble) {
PrintF("%lf ", AsDouble());
} else if (Token() == kUnsigned) {
PrintF("%" PRIu32 " ", AsUnsigned());
} else {
std::string name = Name(Token());
PrintF("%s ", name.c_str());
}
}
#endif
// Remember the token being left behind so that Rewind() can restore it.
preceded_by_newline_ = false;
preceding_token_ = token_;
preceding_position_ = position_;
for (;;) {
// Re-read on every iteration so that position_ ends up at the start of the
// token rather than at the whitespace or comments skipped before it.
position_ = stream_->pos();
uc32 ch = stream_->Advance();
switch (ch) {
case ' ':
case '\t':
case '\r':
// Ignore whitespace.
break;
case '\n':
// Track when we've passed a newline for optional semicolon support,
// but keep scanning.
preceded_by_newline_ = true;
break;
case kEndOfInput:
token_ = kEndOfInput;
return;
case '\'':
case '"':
// ConsumeString sets token_ (either kToken_UseAsm or kParseError).
ConsumeString(ch);
return;
case '/':
// Look one character ahead to tell comments apart from division.
ch = stream_->Advance();
if (ch == '/') {
ConsumeCPPComment();
} else if (ch == '*') {
// An unterminated block comment is a parse error.
if (!ConsumeCComment()) {
token_ = kParseError;
return;
}
} else {
// Plain division: un-read the lookahead character.
stream_->Back();
token_ = '/';
return;
}
// Breaks out of switch, but loops again (i.e. the case when we parsed
// a comment, but need to continue to look for the next token).
break;
case '<':
case '>':
case '=':
case '!':
ConsumeCompareOrShift(ch);
return;
// Expands to one `case` label per entry of SIMPLE_SINGLE_TOKEN_LIST.
#define V(single_char_token) case single_char_token:
SIMPLE_SINGLE_TOKEN_LIST(V)
#undef V
// Use fixed token IDs for ASCII.
token_ = ch;
return;
default:
// Identifiers and numbers set token_ inside their Consume* helpers.
if (IsIdentifierStart(ch)) {
ConsumeIdentifier(ch);
} else if (IsNumberStart(ch)) {
ConsumeNumber(ch);
} else {
// TODO(bradnelson): Support unicode (probably via UnicodeCache).
token_ = kParseError;
}
return;
}
}
}
// Steps back by exactly one token. The current token is parked in the next_*
// slots so that the following Next() call can restore it without re-reading
// the stream. Only a single level of rewind is supported.
void AsmJsScanner::Rewind() {
  DCHECK_NE(kUninitialized, preceding_token_);
  // TODO(bradnelson): Currently rewinding needs to leave in place the
  // preceding newline state (in case a |0 ends a line).
  // This is weird and stateful, fix me.
  DCHECK(!rewind_);

  // Shift the source positions back by one slot.
  next_position_ = position_;
  position_ = preceding_position_;
  preceding_position_ = 0;

  // Shift the tokens back by one slot.
  next_token_ = token_;
  token_ = preceding_token_;
  preceding_token_ = kUninitialized;

  identifier_string_.clear();
  rewind_ = true;
}
// Drops every entry of the local name table.
void AsmJsScanner::ResetLocals() {
  local_names_.clear();
}
#if DEBUG
// Debug-only helper: maps |token| back to a printable spelling. Printable
// ASCII tokens map to themselves; everything else is looked up in the local,
// global and property name tables (in that order) and finally in the fixed
// symbol and special-token lists.
std::string AsmJsScanner::Name(token_t token) const {
  const bool is_printable_ascii = token >= 32 && token < 127;
  if (is_printable_ascii) {
    return std::string(1, static_cast<char>(token));
  }

  for (const auto& entry : local_names_) {
    if (entry.second == token) return entry.first;
  }
  for (const auto& entry : global_names_) {
    if (entry.second == token) return entry.first;
  }
  for (const auto& entry : property_names_) {
    if (entry.second == token) return entry.first;
  }

  switch (token) {
#define V(rawname, name) \
  case kToken_##name:    \
    return rawname;
    LONG_SYMBOL_NAME_LIST(V)
#undef V
#define V(name, value, string_name) \
  case name:                        \
    return string_name;
    SPECIAL_TOKEN_LIST(V)
    default:
      break;
#undef V
  }
  // Every token handed to this function is expected to have a name.
  UNREACHABLE();
}
#endif
// Repositions the underlying stream at |pos|, discards all token history
// (including any pending rewind) and scans the token found at the new
// position.
void AsmJsScanner::Seek(size_t pos) {
  stream_->Seek(pos);

  rewind_ = false;
  preceding_token_ = token_ = next_token_ = kUninitialized;
  preceding_position_ = position_ = next_position_ = 0;

  Next();
}
// Scans an identifier whose first character |ch| has already been read and
// sets token_ to its id. Known names reuse their existing id; unknown names
// are assigned a fresh id and recorded in the table matching the context:
//  - directly after a '.': the property name table,
//  - while in local scope: the local name table,
//  - otherwise:            the global name table.
void AsmJsScanner::ConsumeIdentifier(uc32 ch) {
  // Collect the identifier's characters.
  identifier_string_.clear();
  for (; IsIdentifierPart(ch); ch = stream_->Advance()) {
    identifier_string_ += ch;
  }
  // The character that ended the identifier belongs to the next token.
  stream_->Back();

  // Property access: only the property table is consulted.
  if (preceding_token_ == '.') {
    auto known_property = property_names_.find(identifier_string_);
    if (known_property != property_names_.end()) {
      token_ = known_property->second;
      return;
    }
    // New properties draw their ids from the same counter as globals.
    CHECK_LT(global_count_, kMaxIdentifierCount);
    token_ = kGlobalsStart + global_count_++;
    property_names_[identifier_string_] = token_;
    return;
  }

  // Locals are looked up regardless of the current scope.
  auto known_local = local_names_.find(identifier_string_);
  if (known_local != local_names_.end()) {
    token_ = known_local->second;
    return;
  }

  // In local scope an unknown name becomes a new local; local ids count
  // downwards from kLocalsStart.
  if (in_local_scope_) {
    CHECK_LT(local_names_.size(), kMaxIdentifierCount);
    token_ = kLocalsStart - static_cast<token_t>(local_names_.size());
    local_names_[identifier_string_] = token_;
    return;
  }

  // Global scope: reuse a known global or register a new one; global ids
  // count upwards from kGlobalsStart.
  auto known_global = global_names_.find(identifier_string_);
  if (known_global != global_names_.end()) {
    token_ = known_global->second;
    return;
  }
  CHECK_LT(global_count_, kMaxIdentifierCount);
  token_ = kGlobalsStart + global_count_++;
  global_names_[identifier_string_] = token_;
}
// Scans a numeric literal (or a lone '.') whose first character |ch| has
// already been read, and sets token_ accordingly:
//  - '.'         if the input turns out to be just a dot,
//  - kDouble     (value in double_value_) if the literal contains a '.' or
//                its value is not an integer (e.g. "1e-1"),
//  - kUnsigned   (value in unsigned_value_) for integers up to kMaxUInt32,
//  - kParseError for anything that fails to convert or exceeds kMaxUInt32.
void AsmJsScanner::ConsumeNumber(uc32 ch) {
  std::string number(1, static_cast<char>(ch));
  bool has_dot = ch == '.';
  bool has_prefix = false;
  // Greedily collect everything that could be part of a decimal, hex, octal
  // or binary literal; validation is left to StringToDouble below.
  for (;;) {
    ch = stream_->Advance();
    if ((ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f') ||
        (ch >= 'A' && ch <= 'F') || ch == '.' || ch == 'b' || ch == 'o' ||
        ch == 'x' ||
        // A sign is only part of the literal directly after an exponent
        // marker, and never in prefixed (0x/0o/0b) literals.
        ((ch == '-' || ch == '+') && !has_prefix &&
         (number.back() == 'e' || number.back() == 'E'))) {
      // TODO(bradnelson): Test weird cases ending in -.
      if (ch == '.') {
        has_dot = true;
      }
      if (ch == 'b' || ch == 'o' || ch == 'x') {
        has_prefix = true;
      }
      number.push_back(static_cast<char>(ch));
    } else {
      break;
    }
  }
  // The character that ended the literal belongs to the next token.
  stream_->Back();
  // Special case the most common number.
  if (number.size() == 1 && number[0] == '0') {
    unsigned_value_ = 0;
    token_ = kUnsigned;
    return;
  }
  // Pick out dot.
  if (number.size() == 1 && number[0] == '.') {
    token_ = '.';
    return;
  }
  // Decode numbers.
  UnicodeCache cache;
  double_value_ = StringToDouble(
      &cache,
      Vector<const uint8_t>(reinterpret_cast<const uint8_t*>(number.data()),
                            static_cast<int>(number.size())),
      ALLOW_HEX | ALLOW_OCTAL | ALLOW_BINARY | ALLOW_IMPLICIT_OCTAL);
  if (std::isnan(double_value_)) {
    // Check if string to number conversion didn't consume all the characters.
    // This happens if the character filter let through something invalid
    // like: 0123ef for example.
    // TODO(bradnelson): Check if this happens often enough to be a perf
    // problem.
    if (number[0] == '.') {
      // Treat the leading '.' as its own token and un-read everything that
      // was collected after it.
      for (size_t k = 1; k < number.size(); ++k) {
        stream_->Back();
      }
      token_ = '.';
      return;
    }
    // Anything else that doesn't parse is an error.
    token_ = kParseError;
    return;
  }
  // A literal without a '.' can still have a fractional value (e.g. "1e-1").
  // Report it as a double instead of silently truncating it to an integer.
  if (has_dot || std::trunc(double_value_) != double_value_) {
    token_ = kDouble;
  } else {
    // Exceeding safe integer range is an error.
    if (double_value_ > static_cast<double>(kMaxUInt32)) {
      token_ = kParseError;
      return;
    }
    unsigned_value_ = static_cast<uint32_t>(double_value_);
    token_ = kUnsigned;
  }
}
bool AsmJsScanner::ConsumeCComment() {
for (;;) {
uc32 ch = stream_->Advance();
while (ch == '*') {
ch = stream_->Advance();
if (ch == '/') {
return true;
}
}
if (ch == kEndOfInput) {
return false;
}
}
}
void AsmJsScanner::ConsumeCPPComment() {
for (;;) {
uc32 ch = stream_->Advance();
if (ch == '\n' || ch == kEndOfInput) {
return;
}
}
}
// Scans a string literal whose opening |quote| has already been read. The
// only accepted content is `use asm`, terminated by the same quote character;
// that yields kToken_UseAsm, anything else yields kParseError.
void AsmJsScanner::ConsumeString(uc32 quote) {
  // Only string allowed is 'use asm' / "use asm".
  static const char kDirective[] = "use asm";

  for (size_t i = 0; kDirective[i] != '\0'; ++i) {
    if (stream_->Advance() != kDirective[i]) {
      token_ = kParseError;
      return;
    }
  }

  // The closing quote has to match the opening one.
  const bool properly_closed = stream_->Advance() == quote;
  if (!properly_closed) {
    token_ = kParseError;
    return;
  }
  token_ = kToken_UseAsm;
}
// Scans an operator starting with |ch|, which is one of '<', '>', '=' or '!'
// and has already been read. Recognizes <=, >=, ==, !=, <<, >> and >>>;
// otherwise the single character |ch| is the token.
void AsmJsScanner::ConsumeCompareOrShift(uc32 ch) {
  const uc32 following = stream_->Advance();

  // Two-character comparisons: <=, >=, ==, !=.
  if (following == '=') {
    if (ch == '<') {
      token_ = kToken_LE;
    } else if (ch == '>') {
      token_ = kToken_GE;
    } else if (ch == '=') {
      token_ = kToken_EQ;
    } else if (ch == '!') {
      token_ = kToken_NE;
    } else {
      UNREACHABLE();
    }
    return;
  }

  // Left shift.
  if (ch == '<' && following == '<') {
    token_ = kToken_SHL;
    return;
  }

  // Right shifts: a third '>' selects kToken_SHR, otherwise kToken_SAR.
  if (ch == '>' && following == '>') {
    if (stream_->Advance() == '>') {
      token_ = kToken_SHR;
      return;
    }
    // The third character was not part of the operator; un-read it.
    stream_->Back();
    token_ = kToken_SAR;
    return;
  }

  // Plain single-character token; un-read the lookahead.
  stream_->Back();
  token_ = ch;
}
// Returns true if |ch| may begin an identifier: '_', '$', or a character
// whose AsciiAlphaToLower() form lies in 'a'..'z'.
bool AsmJsScanner::IsIdentifierStart(uc32 ch) {
  if (ch == '_' || ch == '$') {
    return true;
  }
  return IsInRange(AsciiAlphaToLower(ch), 'a', 'z');
}
// Returns true if |ch| may continue an identifier, as decided by
// IsAsciiIdentifier().
bool AsmJsScanner::IsIdentifierPart(uc32 ch) {
  return IsAsciiIdentifier(ch);
}
// Returns true if |ch| may begin a numeric literal: a decimal digit or '.'.
bool AsmJsScanner::IsNumberStart(uc32 ch) {
  if (IsDecimalDigit(ch)) {
    return true;
  }
  return ch == '.';
}
} // namespace internal
} // namespace v8