404 lines
17 KiB
JavaScript
404 lines
17 KiB
JavaScript
![]() |
/*---------------------------------------------------------------------------------------------
|
||
|
* Copyright (c) Microsoft Corporation. All rights reserved.
|
||
|
* Licensed under the MIT License. See License.txt in the project root for license information.
|
||
|
*--------------------------------------------------------------------------------------------*/
|
||
|
import * as l10n from '@vscode/l10n';
|
||
|
import { TokenType, ScannerState } from '../htmlLanguageTypes';
|
||
|
class MultiLineStream {
|
||
|
constructor(source, position) {
|
||
|
this.source = source;
|
||
|
this.len = source.length;
|
||
|
this.position = position;
|
||
|
}
|
||
|
eos() {
|
||
|
return this.len <= this.position;
|
||
|
}
|
||
|
getSource() {
|
||
|
return this.source;
|
||
|
}
|
||
|
pos() {
|
||
|
return this.position;
|
||
|
}
|
||
|
goBackTo(pos) {
|
||
|
this.position = pos;
|
||
|
}
|
||
|
goBack(n) {
|
||
|
this.position -= n;
|
||
|
}
|
||
|
advance(n) {
|
||
|
this.position += n;
|
||
|
}
|
||
|
goToEnd() {
|
||
|
this.position = this.source.length;
|
||
|
}
|
||
|
nextChar() {
|
||
|
return this.source.charCodeAt(this.position++) || 0;
|
||
|
}
|
||
|
peekChar(n = 0) {
|
||
|
return this.source.charCodeAt(this.position + n) || 0;
|
||
|
}
|
||
|
advanceIfChar(ch) {
|
||
|
if (ch === this.source.charCodeAt(this.position)) {
|
||
|
this.position++;
|
||
|
return true;
|
||
|
}
|
||
|
return false;
|
||
|
}
|
||
|
advanceIfChars(ch) {
|
||
|
let i;
|
||
|
if (this.position + ch.length > this.source.length) {
|
||
|
return false;
|
||
|
}
|
||
|
for (i = 0; i < ch.length; i++) {
|
||
|
if (this.source.charCodeAt(this.position + i) !== ch[i]) {
|
||
|
return false;
|
||
|
}
|
||
|
}
|
||
|
this.advance(i);
|
||
|
return true;
|
||
|
}
|
||
|
advanceIfRegExp(regex) {
|
||
|
const str = this.source.substr(this.position);
|
||
|
const match = str.match(regex);
|
||
|
if (match) {
|
||
|
this.position = this.position + match.index + match[0].length;
|
||
|
return match[0];
|
||
|
}
|
||
|
return '';
|
||
|
}
|
||
|
advanceUntilRegExp(regex) {
|
||
|
const str = this.source.substr(this.position);
|
||
|
const match = str.match(regex);
|
||
|
if (match) {
|
||
|
this.position = this.position + match.index;
|
||
|
return match[0];
|
||
|
}
|
||
|
else {
|
||
|
this.goToEnd();
|
||
|
}
|
||
|
return '';
|
||
|
}
|
||
|
advanceUntilChar(ch) {
|
||
|
while (this.position < this.source.length) {
|
||
|
if (this.source.charCodeAt(this.position) === ch) {
|
||
|
return true;
|
||
|
}
|
||
|
this.advance(1);
|
||
|
}
|
||
|
return false;
|
||
|
}
|
||
|
advanceUntilChars(ch) {
|
||
|
while (this.position + ch.length <= this.source.length) {
|
||
|
let i = 0;
|
||
|
for (; i < ch.length && this.source.charCodeAt(this.position + i) === ch[i]; i++) {
|
||
|
}
|
||
|
if (i === ch.length) {
|
||
|
return true;
|
||
|
}
|
||
|
this.advance(1);
|
||
|
}
|
||
|
this.goToEnd();
|
||
|
return false;
|
||
|
}
|
||
|
skipWhitespace() {
|
||
|
const n = this.advanceWhileChar(ch => {
|
||
|
return ch === _WSP || ch === _TAB || ch === _NWL || ch === _LFD || ch === _CAR;
|
||
|
});
|
||
|
return n > 0;
|
||
|
}
|
||
|
advanceWhileChar(condition) {
|
||
|
const posNow = this.position;
|
||
|
while (this.position < this.len && condition(this.source.charCodeAt(this.position))) {
|
||
|
this.position++;
|
||
|
}
|
||
|
return this.position - posNow;
|
||
|
}
|
||
|
}
|
||
|
const _BNG = '!'.charCodeAt(0);
|
||
|
const _MIN = '-'.charCodeAt(0);
|
||
|
const _LAN = '<'.charCodeAt(0);
|
||
|
const _RAN = '>'.charCodeAt(0);
|
||
|
const _FSL = '/'.charCodeAt(0);
|
||
|
const _EQS = '='.charCodeAt(0);
|
||
|
const _DQO = '"'.charCodeAt(0);
|
||
|
const _SQO = '\''.charCodeAt(0);
|
||
|
const _NWL = '\n'.charCodeAt(0);
|
||
|
const _CAR = '\r'.charCodeAt(0);
|
||
|
const _LFD = '\f'.charCodeAt(0);
|
||
|
const _WSP = ' '.charCodeAt(0);
|
||
|
const _TAB = '\t'.charCodeAt(0);
|
||
|
const htmlScriptContents = {
|
||
|
'text/x-handlebars-template': true,
|
||
|
// Fix for https://github.com/microsoft/vscode/issues/77977
|
||
|
'text/html': true,
|
||
|
};
|
||
|
export function createScanner(input, initialOffset = 0, initialState = ScannerState.WithinContent, emitPseudoCloseTags = false) {
|
||
|
const stream = new MultiLineStream(input, initialOffset);
|
||
|
let state = initialState;
|
||
|
let tokenOffset = 0;
|
||
|
let tokenType = TokenType.Unknown;
|
||
|
let tokenError;
|
||
|
let hasSpaceAfterTag;
|
||
|
let lastTag;
|
||
|
let lastAttributeName;
|
||
|
let lastTypeValue;
|
||
|
function nextElementName() {
|
||
|
return stream.advanceIfRegExp(/^[_:\w][_:\w-.\d]*/).toLowerCase();
|
||
|
}
|
||
|
function nextAttributeName() {
|
||
|
return stream.advanceIfRegExp(/^[^\s"'></=\x00-\x0F\x7F\x80-\x9F]*/).toLowerCase();
|
||
|
}
|
||
|
function finishToken(offset, type, errorMessage) {
|
||
|
tokenType = type;
|
||
|
tokenOffset = offset;
|
||
|
tokenError = errorMessage;
|
||
|
return type;
|
||
|
}
|
||
|
function scan() {
|
||
|
const offset = stream.pos();
|
||
|
const oldState = state;
|
||
|
const token = internalScan();
|
||
|
if (token !== TokenType.EOS && offset === stream.pos() && !(emitPseudoCloseTags && (token === TokenType.StartTagClose || token === TokenType.EndTagClose))) {
|
||
|
console.warn('Scanner.scan has not advanced at offset ' + offset + ', state before: ' + oldState + ' after: ' + state);
|
||
|
stream.advance(1);
|
||
|
return finishToken(offset, TokenType.Unknown);
|
||
|
}
|
||
|
return token;
|
||
|
}
|
||
|
function internalScan() {
|
||
|
const offset = stream.pos();
|
||
|
if (stream.eos()) {
|
||
|
return finishToken(offset, TokenType.EOS);
|
||
|
}
|
||
|
let errorMessage;
|
||
|
switch (state) {
|
||
|
case ScannerState.WithinComment:
|
||
|
if (stream.advanceIfChars([_MIN, _MIN, _RAN])) { // -->
|
||
|
state = ScannerState.WithinContent;
|
||
|
return finishToken(offset, TokenType.EndCommentTag);
|
||
|
}
|
||
|
stream.advanceUntilChars([_MIN, _MIN, _RAN]); // -->
|
||
|
return finishToken(offset, TokenType.Comment);
|
||
|
case ScannerState.WithinDoctype:
|
||
|
if (stream.advanceIfChar(_RAN)) {
|
||
|
state = ScannerState.WithinContent;
|
||
|
return finishToken(offset, TokenType.EndDoctypeTag);
|
||
|
}
|
||
|
stream.advanceUntilChar(_RAN); // >
|
||
|
return finishToken(offset, TokenType.Doctype);
|
||
|
case ScannerState.WithinContent:
|
||
|
if (stream.advanceIfChar(_LAN)) { // <
|
||
|
if (!stream.eos() && stream.peekChar() === _BNG) { // !
|
||
|
if (stream.advanceIfChars([_BNG, _MIN, _MIN])) { // <!--
|
||
|
state = ScannerState.WithinComment;
|
||
|
return finishToken(offset, TokenType.StartCommentTag);
|
||
|
}
|
||
|
if (stream.advanceIfRegExp(/^!doctype/i)) {
|
||
|
state = ScannerState.WithinDoctype;
|
||
|
return finishToken(offset, TokenType.StartDoctypeTag);
|
||
|
}
|
||
|
}
|
||
|
if (stream.advanceIfChar(_FSL)) { // /
|
||
|
state = ScannerState.AfterOpeningEndTag;
|
||
|
return finishToken(offset, TokenType.EndTagOpen);
|
||
|
}
|
||
|
state = ScannerState.AfterOpeningStartTag;
|
||
|
return finishToken(offset, TokenType.StartTagOpen);
|
||
|
}
|
||
|
stream.advanceUntilChar(_LAN);
|
||
|
return finishToken(offset, TokenType.Content);
|
||
|
case ScannerState.AfterOpeningEndTag:
|
||
|
const tagName = nextElementName();
|
||
|
if (tagName.length > 0) {
|
||
|
state = ScannerState.WithinEndTag;
|
||
|
return finishToken(offset, TokenType.EndTag);
|
||
|
}
|
||
|
if (stream.skipWhitespace()) { // white space is not valid here
|
||
|
return finishToken(offset, TokenType.Whitespace, l10n.t('Tag name must directly follow the open bracket.'));
|
||
|
}
|
||
|
state = ScannerState.WithinEndTag;
|
||
|
stream.advanceUntilChar(_RAN);
|
||
|
if (offset < stream.pos()) {
|
||
|
return finishToken(offset, TokenType.Unknown, l10n.t('End tag name expected.'));
|
||
|
}
|
||
|
return internalScan();
|
||
|
case ScannerState.WithinEndTag:
|
||
|
if (stream.skipWhitespace()) { // white space is valid here
|
||
|
return finishToken(offset, TokenType.Whitespace);
|
||
|
}
|
||
|
if (stream.advanceIfChar(_RAN)) { // >
|
||
|
state = ScannerState.WithinContent;
|
||
|
return finishToken(offset, TokenType.EndTagClose);
|
||
|
}
|
||
|
if (emitPseudoCloseTags && stream.peekChar() === _LAN) { // <
|
||
|
state = ScannerState.WithinContent;
|
||
|
return finishToken(offset, TokenType.EndTagClose, l10n.t('Closing bracket missing.'));
|
||
|
}
|
||
|
errorMessage = l10n.t('Closing bracket expected.');
|
||
|
break;
|
||
|
case ScannerState.AfterOpeningStartTag:
|
||
|
lastTag = nextElementName();
|
||
|
lastTypeValue = void 0;
|
||
|
lastAttributeName = void 0;
|
||
|
if (lastTag.length > 0) {
|
||
|
hasSpaceAfterTag = false;
|
||
|
state = ScannerState.WithinTag;
|
||
|
return finishToken(offset, TokenType.StartTag);
|
||
|
}
|
||
|
if (stream.skipWhitespace()) { // white space is not valid here
|
||
|
return finishToken(offset, TokenType.Whitespace, l10n.t('Tag name must directly follow the open bracket.'));
|
||
|
}
|
||
|
state = ScannerState.WithinTag;
|
||
|
stream.advanceUntilChar(_RAN);
|
||
|
if (offset < stream.pos()) {
|
||
|
return finishToken(offset, TokenType.Unknown, l10n.t('Start tag name expected.'));
|
||
|
}
|
||
|
return internalScan();
|
||
|
case ScannerState.WithinTag:
|
||
|
if (stream.skipWhitespace()) {
|
||
|
hasSpaceAfterTag = true; // remember that we have seen a whitespace
|
||
|
return finishToken(offset, TokenType.Whitespace);
|
||
|
}
|
||
|
if (hasSpaceAfterTag) {
|
||
|
lastAttributeName = nextAttributeName();
|
||
|
if (lastAttributeName.length > 0) {
|
||
|
state = ScannerState.AfterAttributeName;
|
||
|
hasSpaceAfterTag = false;
|
||
|
return finishToken(offset, TokenType.AttributeName);
|
||
|
}
|
||
|
}
|
||
|
if (stream.advanceIfChars([_FSL, _RAN])) { // />
|
||
|
state = ScannerState.WithinContent;
|
||
|
return finishToken(offset, TokenType.StartTagSelfClose);
|
||
|
}
|
||
|
if (stream.advanceIfChar(_RAN)) { // >
|
||
|
if (lastTag === 'script') {
|
||
|
if (lastTypeValue && htmlScriptContents[lastTypeValue]) {
|
||
|
// stay in html
|
||
|
state = ScannerState.WithinContent;
|
||
|
}
|
||
|
else {
|
||
|
state = ScannerState.WithinScriptContent;
|
||
|
}
|
||
|
}
|
||
|
else if (lastTag === 'style') {
|
||
|
state = ScannerState.WithinStyleContent;
|
||
|
}
|
||
|
else {
|
||
|
state = ScannerState.WithinContent;
|
||
|
}
|
||
|
return finishToken(offset, TokenType.StartTagClose);
|
||
|
}
|
||
|
if (emitPseudoCloseTags && stream.peekChar() === _LAN) { // <
|
||
|
state = ScannerState.WithinContent;
|
||
|
return finishToken(offset, TokenType.StartTagClose, l10n.t('Closing bracket missing.'));
|
||
|
}
|
||
|
stream.advance(1);
|
||
|
return finishToken(offset, TokenType.Unknown, l10n.t('Unexpected character in tag.'));
|
||
|
case ScannerState.AfterAttributeName:
|
||
|
if (stream.skipWhitespace()) {
|
||
|
hasSpaceAfterTag = true;
|
||
|
return finishToken(offset, TokenType.Whitespace);
|
||
|
}
|
||
|
if (stream.advanceIfChar(_EQS)) {
|
||
|
state = ScannerState.BeforeAttributeValue;
|
||
|
return finishToken(offset, TokenType.DelimiterAssign);
|
||
|
}
|
||
|
state = ScannerState.WithinTag;
|
||
|
return internalScan(); // no advance yet - jump to WithinTag
|
||
|
case ScannerState.BeforeAttributeValue:
|
||
|
if (stream.skipWhitespace()) {
|
||
|
return finishToken(offset, TokenType.Whitespace);
|
||
|
}
|
||
|
let attributeValue = stream.advanceIfRegExp(/^[^\s"'`=<>]+/);
|
||
|
if (attributeValue.length > 0) {
|
||
|
if (stream.peekChar() === _RAN && stream.peekChar(-1) === _FSL) { // <foo bar=http://foo/>
|
||
|
stream.goBack(1);
|
||
|
attributeValue = attributeValue.substring(0, attributeValue.length - 1);
|
||
|
}
|
||
|
if (lastAttributeName === 'type') {
|
||
|
lastTypeValue = attributeValue;
|
||
|
}
|
||
|
if (attributeValue.length > 0) {
|
||
|
state = ScannerState.WithinTag;
|
||
|
hasSpaceAfterTag = false;
|
||
|
return finishToken(offset, TokenType.AttributeValue);
|
||
|
}
|
||
|
}
|
||
|
const ch = stream.peekChar();
|
||
|
if (ch === _SQO || ch === _DQO) {
|
||
|
stream.advance(1); // consume quote
|
||
|
if (stream.advanceUntilChar(ch)) {
|
||
|
stream.advance(1); // consume quote
|
||
|
}
|
||
|
if (lastAttributeName === 'type') {
|
||
|
lastTypeValue = stream.getSource().substring(offset + 1, stream.pos() - 1);
|
||
|
}
|
||
|
state = ScannerState.WithinTag;
|
||
|
hasSpaceAfterTag = false;
|
||
|
return finishToken(offset, TokenType.AttributeValue);
|
||
|
}
|
||
|
state = ScannerState.WithinTag;
|
||
|
hasSpaceAfterTag = false;
|
||
|
return internalScan(); // no advance yet - jump to WithinTag
|
||
|
case ScannerState.WithinScriptContent:
|
||
|
// see http://stackoverflow.com/questions/14574471/how-do-browsers-parse-a-script-tag-exactly
|
||
|
let sciptState = 1;
|
||
|
while (!stream.eos()) {
|
||
|
const match = stream.advanceIfRegExp(/<!--|-->|<\/?script\s*\/?>?/i);
|
||
|
if (match.length === 0) {
|
||
|
stream.goToEnd();
|
||
|
return finishToken(offset, TokenType.Script);
|
||
|
}
|
||
|
else if (match === '<!--') {
|
||
|
if (sciptState === 1) {
|
||
|
sciptState = 2;
|
||
|
}
|
||
|
}
|
||
|
else if (match === '-->') {
|
||
|
sciptState = 1;
|
||
|
}
|
||
|
else if (match[1] !== '/') { // <script
|
||
|
if (sciptState === 2) {
|
||
|
sciptState = 3;
|
||
|
}
|
||
|
}
|
||
|
else { // </script
|
||
|
if (sciptState === 3) {
|
||
|
sciptState = 2;
|
||
|
}
|
||
|
else {
|
||
|
stream.goBack(match.length); // to the beginning of the closing tag
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
state = ScannerState.WithinContent;
|
||
|
if (offset < stream.pos()) {
|
||
|
return finishToken(offset, TokenType.Script);
|
||
|
}
|
||
|
return internalScan(); // no advance yet - jump to content
|
||
|
case ScannerState.WithinStyleContent:
|
||
|
stream.advanceUntilRegExp(/<\/style/i);
|
||
|
state = ScannerState.WithinContent;
|
||
|
if (offset < stream.pos()) {
|
||
|
return finishToken(offset, TokenType.Styles);
|
||
|
}
|
||
|
return internalScan(); // no advance yet - jump to content
|
||
|
}
|
||
|
stream.advance(1);
|
||
|
state = ScannerState.WithinContent;
|
||
|
return finishToken(offset, TokenType.Unknown, errorMessage);
|
||
|
}
|
||
|
return {
|
||
|
scan,
|
||
|
getTokenType: () => tokenType,
|
||
|
getTokenOffset: () => tokenOffset,
|
||
|
getTokenLength: () => stream.pos() - tokenOffset,
|
||
|
getTokenEnd: () => stream.pos(),
|
||
|
getTokenText: () => stream.getSource().substring(tokenOffset, stream.pos()),
|
||
|
getScannerState: () => state,
|
||
|
getTokenError: () => tokenError
|
||
|
};
|
||
|
}
|