代码如下
其他部分还没有完成,这里只考虑数值解析这一段,主要从可读性和性能两个方面来看。
package compile.craft;
import lombok.extern.slf4j.Slf4j;
import static compile.craft.CharUtils.*;
@Slf4j
public class Lexer {
boolean fetchedEOF = false;
private final String source;
private int pos = 0;
private char ch;
private int line = 0;
private int col = 0;
public Lexer(String source) {
this.source = source;
}
/**
* 1. line 维护
*
*/
public Token nextToken() {
if (fetchedEOF) {
return null;
}
while (true) {
do {
advance();
} while (isBlank(ch));
if (ch == EOF) {
fetchedEOF = true;
return null;
}
// 处理 // 注释
if (isIdentifierStart(ch)) {
return scanIdentifier();
}
if (ch == '.') {
return scanNumber();
}
if (CharUtils.isDigit(ch)) {
return scanNumber();
}
break;
}
return null;
}
private Token scanNumber() {
int start = pos - 1;
TokenKind tokenKind = null;
if (ch == '0') {
advance();
// HEX_LITERAL: '0' [xX] [0-9a-fA-F] ([0-9a-fA-F_]* [0-9a-fA-F])? [lL]?;
// HEX_FLOAT_LITERAL: '0' [xX] (HexDigits '.'? | HexDigits? '.' HexDigits) [pP] [+-]? Digits [fFdD]?;
// HexDigits: HexDigit ((HexDigit | '_')* HexDigit)?;
// Digits: [0-9] ([0-9_]* [0-9])?;
if (ch == 'x' || ch == 'X') {
if (peek() == '.') {
advance();
scanHexFraction(false);
String lexeme = source.substring(start, pos);
return new Token(TokenKind.HEX_FLOAT_LITERAL, lexeme);
} else {
scanHex();
if (ch == '.') {
scanHexFraction(true);
tokenKind = TokenKind.HEX_FLOAT_LITERAL;
} else if (ch == 'p' || ch == 'P') {
scanExp();
tokenKind = TokenKind.HEX_FLOAT_LITERAL;
} else if (ch == 'f' || ch == 'F' || ch == 'd' || ch == 'D') {
tokenKind = TokenKind.HEX_FLOAT_LITERAL;
} else if (ch == 'l' || ch == 'L') {
tokenKind = TokenKind.HEX_LITERAL;
} else {
retreat();
tokenKind = TokenKind.HEX_LITERAL;
}
String lexeme = source.substring(start, pos);
return new Token(tokenKind, lexeme);
}
} else if (ch == 'b' || ch == 'B') {
// BINARY_LITERAL: '0' [bB] [01] ([01_]* [01])? [lL]?;
scanBit();
if ((ch != 'l') && (ch != 'L')) {
retreat();
}
String lexeme = source.substring(start, pos);
return new Token(TokenKind.BINARY_LITERAL, lexeme);
} else if (ch == '_' || isOct(ch)) {
// OCT_LITERAL: '0' '_'* [0-7] ([0-7_]* [0-7])? [lL]?;
// FLOAT_LITERAL: (Digits '.' Digits? | '.' Digits) ExponentPart? [fFdD]?;
// FLOAT_LITERAL: Digits (ExponentPart [fFdD]? | [fFdD]);
scanOct();
if (isDigit(ch)) {
scanDigit();
if (ch == '.') {
scanFraction();
} else if (ch == 'e' || ch == 'E') {
scanExp();
} else if ((ch != 'f') && (ch != 'F') && (ch != 'd') && (ch != 'D')) {
error("invalid oct literal");
}
tokenKind = TokenKind.FLOAT_LITERAL;
} else if (ch == '.') {
scanFraction();
tokenKind = TokenKind.FLOAT_LITERAL;
} else if (ch == 'e' || ch == 'E') {
scanExp();
tokenKind = TokenKind.FLOAT_LITERAL;
} else if (ch == 'd' || ch == 'D' || ch == 'f' || ch == 'F') {
tokenKind = TokenKind.FLOAT_LITERAL;
} else if (ch == 'l' || ch == 'L') {
tokenKind = TokenKind.OCT_LITERAL;
} else {
retreat();
tokenKind = TokenKind.OCT_LITERAL;
}
String lexeme = source.substring(start, pos);
return new Token(tokenKind, lexeme);
} else {
// FLOAT_LITERAL: (Digits '.' Digits? | '.' Digits) ExponentPart? [fFdD]?;
// FLOAT_LITERAL: Digits (ExponentPart [fFdD]? | [fFdD]);
// DECIMAL_LITERAL: ('0' | [1-9] (Digits? | '_'+ Digits)) [lL]?;
// ExponentPart: [eE] [+-]? Digits;
if (isDigit(ch)) {
do {
advance();
} while (isDigit(ch));
if (ch == '.') {
scanFraction();
tokenKind = TokenKind.FLOAT_LITERAL;
} else if (ch == 'e' || ch == 'E') {
scanExp();
tokenKind = TokenKind.FLOAT_LITERAL;
} else if ((ch == 'f') || ch == 'F' || ch == 'd' || ch == 'D') {
tokenKind = TokenKind.FLOAT_LITERAL;
} else {
error("invalid float literal");
}
} else if (ch == '.') {
scanFraction();
tokenKind = TokenKind.FLOAT_LITERAL;
} else if (ch == 'e' || ch == 'E') {
scanExp();
tokenKind = TokenKind.FLOAT_LITERAL;
} else if (ch == 'l' || ch == 'L') {
tokenKind = TokenKind.DECIMAL_LITERAL;
} else if (ch == 'f' || ch == 'F' || ch == 'd' || ch == 'D') {
tokenKind = TokenKind.FLOAT_LITERAL;
} else {
retreat();
tokenKind = TokenKind.DECIMAL_LITERAL;
}
String lexeme = source.substring(start, pos);
return new Token(tokenKind, lexeme);
}
} else {
// FLOAT_LITERAL: (Digits '.' Digits? | '.' Digits) ExponentPart? [fFdD]?;
// FLOAT_LITERAL: Digits (ExponentPart [fFdD]? | [fFdD]);
// DECIMAL_LITERAL: ('0' | [1-9] (Digits? | '_'+ Digits)) [lL]?;
// ExponentPart: [eE] [+-]? Digits;
if (ch == '.') {
scanDigit();
if (ch == 'e' || ch == 'E') {
scanExp();
}
tokenKind = TokenKind.FLOAT_LITERAL;
} else {
scanDigit(true);
if (ch == '.') {
scanFraction();
tokenKind = TokenKind.FLOAT_LITERAL;
} else if (ch == 'e' || ch == 'E') {
scanExp();
tokenKind = TokenKind.FLOAT_LITERAL;
} else if (ch == 'f' || ch == 'F' || ch == 'd' || ch == 'D') {
tokenKind = TokenKind.FLOAT_LITERAL;
} else if (ch == 'l' || ch == 'L') {
tokenKind = TokenKind.DECIMAL_LITERAL;
} else {
retreat();
tokenKind = TokenKind.DECIMAL_LITERAL;
}
}
String lexeme = source.substring(start, pos);
return new Token(tokenKind, lexeme);
}
}
private Token scanIdentifier() {
int start = pos - 1;
do {
advance();
} while (isIdentifierChar(ch));
String lexeme = source.substring(start, pos);
retreat();
if (Token.isKeyword(lexeme)) {
TokenKind kind = Token.kind(lexeme);
return new Token(kind, kind.literal);
} else {
return new Token(TokenKind.IDENTIFIER, lexeme);
}
}
private void scanHex() {
advance();
if (!isHex(ch)) {
error("invalid hexadecimal literal");
}
do {
if (ch == '_') {
do {
advance();
} while(ch == '_');
if (!isHex(ch)) {
error("invalid hexadecimal literal");
}
}
do {
advance();
} while (isHex(ch));
} while (ch == '_');
}
private void scanBit() {
advance();
if (!isBit(ch)) {
error("invalid binary literal");
}
do {
if (ch == '_') {
do {
advance();
} while(ch == '_');
if (!isBit(ch)) {
error("invalid binary literal");
}
}
do {
advance();
} while (isBit(ch));
} while (ch == '_');
}
private void scanOct() {
do {
if (ch == '_') {
do {
advance();
} while(ch == '_');
if (!isDigit(ch)) {
error("invalid octal literal");
}
if (!isOct(ch)) {
return;
}
}
do {
advance();
} while (isOct(ch));
} while (ch == '_');
}
private void scanDigit(boolean hasDigit) {
advance();
if (hasDigit) {
if ((ch != '_') && !isDigit(ch)) {
return;
}
} else if (!isDigit(ch)) {
error("invalid decimal literal");
}
do {
if (ch == '_') {
do {
advance();
} while(ch == '_');
if (!isDigit(ch)) {
error("invalid decimal literal");
}
}
do {
advance();
} while (isDigit(ch));
} while (ch == '_');
}
private void scanDigit() {
scanDigit(false);
}
// FLOAT_LITERAL: (Digits '.' Digits? | '.' Digits) ExponentPart? [fFdD]?;
private void scanFraction() {
if (isDigit(peek())) {
scanDigit();
} else {
advance();
}
if (ch == 'e' || ch == 'E') {
char c = peek();
if (c == '+' || c == '-') {
advance();
}
scanDigit();
}
if ((ch != 'f') && (ch != 'F') && (ch != 'd') && (ch != 'D')) {
retreat();
}
}
// HEX_FLOAT_LITERAL: '0' [xX] (HexDigits '.'? | HexDigits? '.' HexDigits) [pP] [+-]? Digits [fFdD]?;
private void scanHexFraction(boolean hasDigit) {
if (hasDigit) {
if (isHex(peek())) {
scanHex();
} else {
advance();
}
} else {
scanHex();
}
if ((ch != 'p') && (ch != 'P')) {
error("invalid hexadecimal literal");
}
char c = peek();
if (c == '+' || c == '-') {
advance();
}
scanDigit();
if ((ch != 'f') && (ch != 'F') && (ch != 'd') && (ch != 'D')) {
retreat();
}
}
private void scanExp() {
char c = peek();
if (c == '+' || c == '-') {
advance();
}
scanDigit();
if ((ch != 'f') && (ch != 'F') && (ch != 'd') && (ch != 'D')) {
retreat();
}
}
private void advance() {
if (pos >= source.length()) {
ch = CharUtils.EOF;
return;
}
ch = source.charAt(pos++);
}
private void retreat() {
if (ch != EOF) {
if (--pos < 0) {
error("tokenizer exceed beginning of source");
}
}
}
private char peek() {
if (pos >= source.length()) {
return EOF;
}
return source.charAt(pos);
}
private void error(String msg) {
log.error("lexer error: {}", msg);
throw new RuntimeException(msg);
}
}
这是一个专为移动设备优化的页面(即为了让你能够在 Google 搜索结果里秒开这个页面),如果你希望参与 V2EX 社区的讨论,你可以继续到 V2EX 上打开本讨论主题的完整版本。
V2EX 是创意工作者们的社区,是一个分享自己正在做的有趣事物、交流想法,可以遇见新朋友甚至新机会的地方。
V2EX is a community of developers, designers and creative people.