🎉 initiate project *astro_rewrite*
This commit is contained in:
parent
ffd4d5e86c
commit
2ba37bfbe3
8658 changed files with 2268794 additions and 2538 deletions
424
node_modules/parse-latin/lib/index.js
generated
vendored
Normal file
424
node_modules/parse-latin/lib/index.js
generated
vendored
Normal file
|
|
@ -0,0 +1,424 @@
|
|||
import {mergeInitialWordSymbol} from './plugin/merge-initial-word-symbol.js'
|
||||
import {mergeFinalWordSymbol} from './plugin/merge-final-word-symbol.js'
|
||||
import {mergeInnerWordSymbol} from './plugin/merge-inner-word-symbol.js'
|
||||
import {mergeInnerWordSlash} from './plugin/merge-inner-word-slash.js'
|
||||
import {mergeInitialisms} from './plugin/merge-initialisms.js'
|
||||
import {mergeWords} from './plugin/merge-words.js'
|
||||
import {patchPosition} from './plugin/patch-position.js'
|
||||
import {mergeNonWordSentences} from './plugin/merge-non-word-sentences.js'
|
||||
import {mergeAffixSymbol} from './plugin/merge-affix-symbol.js'
|
||||
import {mergeInitialLowerCaseLetterSentences} from './plugin/merge-initial-lower-case-letter-sentences.js'
|
||||
import {mergeInitialDigitSentences} from './plugin/merge-initial-digit-sentences.js'
|
||||
import {mergePrefixExceptions} from './plugin/merge-prefix-exceptions.js'
|
||||
import {mergeAffixExceptions} from './plugin/merge-affix-exceptions.js'
|
||||
import {mergeRemainingFullStops} from './plugin/merge-remaining-full-stops.js'
|
||||
import {makeInitialWhiteSpaceSiblings} from './plugin/make-initial-white-space-siblings.js'
|
||||
import {makeFinalWhiteSpaceSiblings} from './plugin/make-final-white-space-siblings.js'
|
||||
import {breakImplicitSentences} from './plugin/break-implicit-sentences.js'
|
||||
import {removeEmptyNodes} from './plugin/remove-empty-nodes.js'
|
||||
import {parserFactory} from './parser.js'
|
||||
import {
|
||||
newLine,
|
||||
punctuation,
|
||||
surrogates,
|
||||
terminalMarker,
|
||||
whiteSpace,
|
||||
word
|
||||
} from './expressions.js'
|
||||
|
||||
// PARSE LATIN
|
||||
|
||||
// Transform Latin-script natural language into an NLCST-tree.
export class ParseLatin {
  // Create a parser.
  //
  // `file` takes precedence over `doc` when both are given.  The chosen value
  // is stored as a string on `this.doc`; when both are falsy, `this.doc` is
  // `null`.
  constructor(doc, file) {
    const value = file || doc
    this.doc = value ? String(value) : null
  }

  // Run transform plugins for `key` on `nodes`.
  //
  // Plugins are read from `this[key + 'Plugins']` and called in order, each
  // with `nodes` as its only argument.  Iteration stops at the first falsy
  // entry.  Always returns the given `nodes`.
  run(key, nodes) {
    const plugins = this[key + 'Plugins']
    let index = -1

    if (plugins) {
      while (plugins[++index]) {
        plugins[index](nodes)
      }
    }

    return nodes
  }

  // Easy access to the document parser. This additionally supports retext-style
  // invocation: where an instance is created for each file, and the file is given
  // on construction.
  //
  // Falls back to `this.doc` when `value` is falsy, and hands the result to
  // `this.tokenizeRoot`.
  parse(value) {
    return this.tokenizeRoot(value || this.doc)
  }

  // Transform a `value` into a list of `NLCSTNode`s.
  //
  // Accepts:
  // - `null` / `undefined`: treated as an empty string;
  // - a string or `String` object: split into word, white space, punctuation,
  //   and symbol tokens;
  // - a list that is empty or whose first item has a `type`: returned as is.
  //
  // Anything else throws an `Error` ("Illegal invocation").
  tokenize(value) {
    const tokens = []

    if (value === null || value === undefined) {
      value = ''
    } else if (value instanceof String) {
      value = value.toString()
    }

    if (typeof value !== 'string') {
      // The `in` operator throws a `TypeError` when its right-hand side is a
      // primitive (number, boolean, ...), so only probe for `length` on values
      // that can actually carry properties.  This lets primitives reach the
      // descriptive error below.
      const canHaveLength =
        typeof value === 'object' || typeof value === 'function'

      // Return the given nodes if this is either an empty array, or an array with
      // a node as a first child.
      if (canHaveLength && 'length' in value && (!value[0] || value[0].type)) {
        return value
      }

      throw new Error(
        "Illegal invocation: '" +
          value +
          "' is not a valid argument for 'ParseLatin'"
      )
    }

    if (!value) {
      return tokens
    }

    // Eat mechanism to use.
    const eater = this.position ? eat : noPositionEat

    let index = 0
    let offset = 0
    let line = 1
    let column = 1
    let previous = ''
    let queue = ''
    let left
    let right
    let character

    while (index < value.length) {
      character = value.charAt(index)

      // Classify the current character; `right` is the class of the current
      // character, `left` the class of the previous one.
      if (whiteSpace.test(character)) {
        right = 'WhiteSpace'
      } else if (punctuation.test(character)) {
        right = 'Punctuation'
      } else if (word.test(character)) {
        right = 'Word'
      } else {
        right = 'Symbol'
      }

      tick.call(this)

      previous = character
      character = ''
      left = right
      right = null

      index++
    }

    // `character` is empty and `right` is `null` here, so this last tick
    // flushes whatever is still queued.
    tick.call(this)

    return tokens

    // Check one character.
    //
    // The character is appended to the queue when it has the same class as the
    // previous one and either: the class is `Word` or `WhiteSpace`, the
    // character repeats the previous one, or it matches `surrogates`.
    // Otherwise the queue is flushed through `this['tokenize' + left]` and a
    // new queue is started with the current character.
    function tick() {
      if (
        left === right &&
        (left === 'Word' ||
          left === 'WhiteSpace' ||
          character === previous ||
          surrogates.test(character))
      ) {
        queue += character
      } else {
        // Flush the previous queue.
        if (queue) {
          this['tokenize' + left](queue, eater)
        }

        queue = character
      }
    }

    // Remove `subvalue` from `value`.
    // Expects `subvalue` to be at the start from `value`, and applies no
    // validation.
    //
    // The start point is captured before the cursor is advanced over
    // `subvalue`; the returned function adds a node and patches its position.
    function eat(subvalue) {
      const pos = position()

      update(subvalue)

      return apply

      // Add the given arguments, add `position` to the returned node, and return
      // the node.
      function apply(...input) {
        return pos(add(...input))
      }
    }

    // Remove `subvalue` from `value`.
    // Does not patch positional information.
    function noPositionEat() {
      return add
    }

    // Add mechanism: push `node` onto `parent.children` when a parent is given,
    // otherwise onto the resulting token list.  Returns `node`.
    function add(node, parent) {
      if (parent) {
        parent.children.push(node)
      } else {
        tokens.push(node)
      }

      return node
    }

    // Mark position and patch `node.position`.
    function position() {
      const before = now()

      // Add the position to a node.
      function patch(node) {
        node.position = new Position(before)

        return node
      }

      return patch
    }

    // Update line and column based on `value`.
    function update(subvalue) {
      let character = -1
      let lastIndex = -1

      offset += subvalue.length

      while (++character < subvalue.length) {
        if (subvalue.charAt(character) === '\n') {
          lastIndex = character
          line++
        }
      }

      if (lastIndex < 0) {
        column += subvalue.length
      } else {
        // Restart the column count after the last line feed in `subvalue`.
        column = subvalue.length - lastIndex
      }
    }

    // Store position information for a node: the given `start` and, as `end`,
    // the cursor at the time of construction.
    function Position(start) {
      this.start = start
      this.end = now()
    }

    // Get the current position.
    function now() {
      return {line, column, offset}
    }
  }
}
|
||||
|
||||
// Prototype defaults.
//
// `position` defaults to `true`.  Each text tokenizer (`tokenizeSymbol`,
// `tokenizeWhiteSpace`, `tokenizePunctuation`, `tokenizeSource`,
// `tokenizeText`) is produced by `createTextFactory` for its node type.
Object.assign(ParseLatin.prototype, {
  position: true,
  tokenizeSymbol: createTextFactory('Symbol'),
  tokenizeWhiteSpace: createTextFactory('WhiteSpace'),
  tokenizePunctuation: createTextFactory('Punctuation'),
  tokenizeSource: createTextFactory('Source'),
  tokenizeText: createTextFactory('Text')
})
|
||||
|
||||
// Inject `plugins` to modify the result of the method at `key` on the operated
// on context.  New plugins are placed after the ones already registered.
ParseLatin.prototype.use = useFactory((target, wareKey, list) => {
  target[wareKey] = target[wareKey].concat(list)
})

// Inject `plugins` to modify the result of the method at `key` on the operated
// on context, before any other.
ParseLatin.prototype.useFirst = useFactory((target, wareKey, list) => {
  target[wareKey] = list.concat(target[wareKey])
})
|
||||
|
||||
// PARENT NODES
|
||||
//
|
||||
// All these nodes are `pluggable`: they come with a `use` method which accepts
|
||||
// a plugin (`function(NLCSTNode)`).
|
||||
// Every time one of these methods is called, the plugin is invoked with the
|
||||
// node, allowing for easy modification.
|
||||
//
|
||||
// In fact, the internal transformation from `tokenize` (a list of words, white
|
||||
// space, punctuation, and symbols) to `tokenizeRoot` (an NLCST tree), is also
|
||||
// implemented through this mechanism.
|
||||
|
||||
// Create a `WordNode` holding a single text node (made by `tokenizeText`)
// whose value is the given `value`.
//
// `eat('')` is called before the text is tokenized; when no `eat` is given,
// `noopEat` is used instead.
pluggable(ParseLatin, 'tokenizeWord', function (value, eat) {
  const eatWord = eat || noopEat
  const finish = eatWord('')
  const wordNode = {type: 'WordNode', children: []}

  this.tokenizeText(value, eat, wordNode)

  return finish(wordNode)
})
|
||||
|
||||
// Register the pluggable parent-node tokenizers.  Each entry pairs a method
// name with the options handed to `parserFactory`.
for (const [method, options] of [
  // Create a `SentenceNode` with its children set to `Node`s, their values set
  // to the tokenized given `value`.
  //
  // Unless plugins add new nodes, the sentence is populated by `WordNode`s,
  // `SymbolNode`s, `PunctuationNode`s, and `WhiteSpaceNode`s.
  ['tokenizeSentence', {type: 'SentenceNode', tokenizer: 'tokenize'}],

  // Create a `ParagraphNode` with its children set to `Node`s, their values set
  // to the tokenized given `value`.
  //
  // Unless plugins add new nodes, the paragraph is populated by `SentenceNode`s
  // and `WhiteSpaceNode`s.
  [
    'tokenizeParagraph',
    {
      type: 'ParagraphNode',
      delimiter: terminalMarker,
      delimiterType: 'PunctuationNode',
      tokenizer: 'tokenizeSentence'
    }
  ],

  // Create a `RootNode` with its children set to `Node`s, their values set to the
  // tokenized given `value`.
  [
    'tokenizeRoot',
    {
      type: 'RootNode',
      delimiter: newLine,
      delimiterType: 'WhiteSpaceNode',
      tokenizer: 'tokenizeParagraph'
    }
  ]
]) {
  pluggable(ParseLatin, method, parserFactory(options))
}
|
||||
|
||||
// PLUGINS
|
||||
|
||||
// Default plugins, registered on the prototype per pluggable method, in the
// order listed here.
for (const [method, plugins] of [
  [
    'tokenizeSentence',
    [
      mergeInitialWordSymbol,
      mergeFinalWordSymbol,
      mergeInnerWordSymbol,
      mergeInnerWordSlash,
      mergeInitialisms,
      mergeWords,
      patchPosition
    ]
  ],
  [
    'tokenizeParagraph',
    [
      mergeNonWordSentences,
      mergeAffixSymbol,
      mergeInitialLowerCaseLetterSentences,
      mergeInitialDigitSentences,
      mergePrefixExceptions,
      mergeAffixExceptions,
      mergeRemainingFullStops,
      makeInitialWhiteSpaceSiblings,
      makeFinalWhiteSpaceSiblings,
      breakImplicitSentences,
      removeEmptyNodes,
      patchPosition
    ]
  ],
  [
    'tokenizeRoot',
    [
      makeInitialWhiteSpaceSiblings,
      makeFinalWhiteSpaceSiblings,
      removeEmptyNodes,
      patchPosition
    ]
  ]
]) {
  ParseLatin.prototype.use(method, plugins)
}
|
||||
|
||||
// TEXT NODES
|
||||
|
||||
// Factory to create a `Text`.
//
// Returns a function that builds nodes of type `type + 'Node'`.
function createTextFactory(type) {
  const nodeType = type + 'Node'

  // Construct a `Text` from a bound `type`.
  //
  // `null` and `undefined` values become an empty string.  The value is given
  // to `eat` (or `noopEat` when no `eat` is passed), and the function that
  // returns is called with the new node and `parent`.
  return function createText(value, eat, parent) {
    const content = value === null || value === undefined ? '' : value
    const consume = eat || noopEat
    const add = consume(content)

    return add({type: nodeType, value: String(content)}, parent)
  }
}
|
||||
|
||||
// Make a method "pluggable".
//
// Installs `key` on `Constructor.prototype`.  The installed method calls
// `callback` with the same `this` and arguments, then passes the result
// through `this.run(key, ...)` and returns what `run` returns.
function pluggable(Constructor, key, callback) {
  Constructor.prototype[key] = function (...input) {
    const result = Reflect.apply(callback, this, input)

    return this.run(key, result)
  }
}
|
||||
|
||||
// Factory to inject `plugins`. Takes `callback` for the actual inserting.
function useFactory(callback) {
  // Validate if `plugins` can be inserted.
  // Invokes the bound `callback` to do the actual inserting.
  //
  // Throws when `key` is not present on `this`; does nothing when `plugins`
  // is falsy.
  return function use(key, plugins) {
    const supported = key in this

    // Throw if the method is not pluggable.
    if (!supported) {
      throw new Error(
        'Illegal Invocation: Unsupported `key` for ' +
          '`use(key, plugins)`. Make sure `key` is a ' +
          'supported function'
      )
    }

    // Fail silently when no plugins are given.
    if (!plugins) {
      return
    }

    const wareKey = key + 'Plugins'
    let list

    // Make sure `plugins` is a list: wrap a single function, copy anything
    // else.
    if (typeof plugins === 'function') {
      list = [plugins]
    } else {
      list = plugins.concat()
    }

    // Make sure `wareKey` exists.
    if (!this[wareKey]) {
      this[wareKey] = []
    }

    // Invoke callback with the ware key and plugins.
    callback(this, wareKey, list)
  }
}
|
||||
|
||||
// Add mechanism used when text-tokenisers are called directly outside of the
// `tokenize` function.
//
// Appends `node` to `parent.children` when a parent is given; always returns
// `node`.
function noopAdd(node, parent) {
  if (!parent) {
    return node
  }

  parent.children.push(node)

  return node
}

// Eat and add mechanism without adding positional information, used when
// text-tokenisers are called directly outside of the `tokenize` function.
//
// Ignores its arguments and hands back `noopAdd`.
function noopEat() {
  const adder = noopAdd

  return adder
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue