🎉 initiate project *astro_rewrite*
This commit is contained in:
parent
ffd4d5e86c
commit
2ba37bfbe3
8658 changed files with 2268794 additions and 2538 deletions
1
node_modules/parse-latin/index.js
generated
vendored
Normal file
1
node_modules/parse-latin/index.js
generated
vendored
Normal file
|
|
@ -0,0 +1 @@
|
|||
export {ParseLatin} from './lib/index.js'
|
||||
19
node_modules/parse-latin/lib/expressions.js
generated
vendored
Normal file
19
node_modules/parse-latin/lib/expressions.js
generated
vendored
Normal file
File diff suppressed because one or more lines are too long
424
node_modules/parse-latin/lib/index.js
generated
vendored
Normal file
424
node_modules/parse-latin/lib/index.js
generated
vendored
Normal file
|
|
@ -0,0 +1,424 @@
|
|||
import {mergeInitialWordSymbol} from './plugin/merge-initial-word-symbol.js'
|
||||
import {mergeFinalWordSymbol} from './plugin/merge-final-word-symbol.js'
|
||||
import {mergeInnerWordSymbol} from './plugin/merge-inner-word-symbol.js'
|
||||
import {mergeInnerWordSlash} from './plugin/merge-inner-word-slash.js'
|
||||
import {mergeInitialisms} from './plugin/merge-initialisms.js'
|
||||
import {mergeWords} from './plugin/merge-words.js'
|
||||
import {patchPosition} from './plugin/patch-position.js'
|
||||
import {mergeNonWordSentences} from './plugin/merge-non-word-sentences.js'
|
||||
import {mergeAffixSymbol} from './plugin/merge-affix-symbol.js'
|
||||
import {mergeInitialLowerCaseLetterSentences} from './plugin/merge-initial-lower-case-letter-sentences.js'
|
||||
import {mergeInitialDigitSentences} from './plugin/merge-initial-digit-sentences.js'
|
||||
import {mergePrefixExceptions} from './plugin/merge-prefix-exceptions.js'
|
||||
import {mergeAffixExceptions} from './plugin/merge-affix-exceptions.js'
|
||||
import {mergeRemainingFullStops} from './plugin/merge-remaining-full-stops.js'
|
||||
import {makeInitialWhiteSpaceSiblings} from './plugin/make-initial-white-space-siblings.js'
|
||||
import {makeFinalWhiteSpaceSiblings} from './plugin/make-final-white-space-siblings.js'
|
||||
import {breakImplicitSentences} from './plugin/break-implicit-sentences.js'
|
||||
import {removeEmptyNodes} from './plugin/remove-empty-nodes.js'
|
||||
import {parserFactory} from './parser.js'
|
||||
import {
|
||||
newLine,
|
||||
punctuation,
|
||||
surrogates,
|
||||
terminalMarker,
|
||||
whiteSpace,
|
||||
word
|
||||
} from './expressions.js'
|
||||
|
||||
// PARSE LATIN

// Transform Latin-script natural language into an NLCST-tree.
export class ParseLatin {
  // `doc` (or a vfile-like `file`, which wins when both are given) is
  // stringified and kept so `parse()` can later be called without arguments.
  constructor(doc, file) {
    const value = file || doc
    this.doc = value ? String(value) : null
  }

  // Run transform plugins for `key` on `nodes`.
  // Plugins live on `this[key + 'Plugins']` (see `use`/`useFirst`) and are
  // invoked in order; each may mutate `nodes` in place.
  run(key, nodes) {
    const wareKey = key + 'Plugins'
    const plugins = this[wareKey]
    let index = -1

    if (plugins) {
      // Note: iteration stops at the first hole/falsy entry in the list.
      while (plugins[++index]) {
        plugins[index](nodes)
      }
    }

    return nodes
  }

  // Easy access to the document parser. This additionally supports retext-style
  // invocation: where an instance is created for each file, and the file is given
  // on construction.
  parse(value) {
    return this.tokenizeRoot(value || this.doc)
  }

  // Transform a `value` into a list of `NLCSTNode`s.
  // Each character is classified (white space / punctuation / word / symbol,
  // via the regexes from `./expressions.js`); maximal runs of one class are
  // handed to the matching `tokenize*` method.
  tokenize(value) {
    const tokens = []

    if (value === null || value === undefined) {
      value = ''
    } else if (value instanceof String) {
      // Unwrap boxed strings (`new String(…)`).
      value = value.toString()
    }

    if (typeof value !== 'string') {
      // Return the given nodes if this is either an empty array, or an array with
      // a node as a first child.
      if ('length' in value && (!value[0] || value[0].type)) {
        return value
      }

      throw new Error(
        "Illegal invocation: '" +
          value +
          "' is not a valid argument for 'ParseLatin'"
      )
    }

    if (!value) {
      return tokens
    }

    // Eat mechanism to use: positionless when `this.position` is off.
    const eater = this.position ? eat : noPositionEat

    let index = 0
    let offset = 0
    let line = 1
    let column = 1
    let previous = ''
    let queue = ''
    let left
    let right
    let character

    while (index < value.length) {
      character = value.charAt(index)

      // Classify the current character.
      if (whiteSpace.test(character)) {
        right = 'WhiteSpace'
      } else if (punctuation.test(character)) {
        right = 'Punctuation'
      } else if (word.test(character)) {
        right = 'Word'
      } else {
        right = 'Symbol'
      }

      tick.call(this)

      previous = character
      character = ''
      left = right
      right = null

      index++
    }

    // Flush whatever is still queued after the last character.
    tick.call(this)

    return tokens

    // Check one character: extend the current run or flush it.
    function tick() {
      if (
        left === right &&
        (left === 'Word' ||
          left === 'WhiteSpace' ||
          character === previous ||
          surrogates.test(character))
      ) {
        queue += character
      } else {
        // Flush the previous queue.
        if (queue) {
          this['tokenize' + left](queue, eater)
        }

        queue = character
      }
    }

    // Remove `subvalue` from `value`.
    // Expects `subvalue` to be at the start of `value`, and applies no
    // validation.
    function eat(subvalue) {
      const pos = position()

      update(subvalue)

      return apply

      // Add the given arguments, add `position` to the returned node, and return
      // the node.
      function apply(...input) {
        return pos(add(...input))
      }
    }

    // Remove `subvalue` from `value`.
    // Does not patch positional information.
    function noPositionEat() {
      return add
    }

    // Add mechanism: append `node` to `parent` when given, else to the
    // top-level `tokens` result.
    function add(node, parent) {
      if (parent) {
        parent.children.push(node)
      } else {
        tokens.push(node)
      }

      return node
    }

    // Mark position and patch `node.position`.
    function position() {
      const before = now()

      // Add the position to a node.
      function patch(node) {
        node.position = new Position(before)

        return node
      }

      return patch
    }

    // Update line and column based on `value`.
    function update(subvalue) {
      let character = -1
      let lastIndex = -1

      offset += subvalue.length

      while (++character < subvalue.length) {
        if (subvalue.charAt(character) === '\n') {
          lastIndex = character
          line++
        }
      }

      if (lastIndex < 0) {
        column += subvalue.length
      } else {
        // Column restarts after the last new-line in `subvalue`.
        column = subvalue.length - lastIndex
      }
    }

    // Store position information for a node.
    function Position(start) {
      this.start = start
      this.end = now()
    }

    // Get the current position.
    function now() {
      return {line, column, offset}
    }
  }
}
|
||||
|
||||
// Default position: patch positional information onto nodes.
ParseLatin.prototype.position = true

// Create text nodes.
ParseLatin.prototype.tokenizeSymbol = createTextFactory('Symbol')
ParseLatin.prototype.tokenizeWhiteSpace = createTextFactory('WhiteSpace')
ParseLatin.prototype.tokenizePunctuation = createTextFactory('Punctuation')
ParseLatin.prototype.tokenizeSource = createTextFactory('Source')
ParseLatin.prototype.tokenizeText = createTextFactory('Text')

// Inject `plugins` to modify the result of the method at `key` on the operated
// on context. (Here `key` is already the ware key, e.g. `tokenizeRootPlugins`;
// new plugins are appended after the existing ones.)
ParseLatin.prototype.use = useFactory(function (context, key, plugins) {
  context[key] = context[key].concat(plugins)
})

// Inject `plugins` to modify the result of the method at `key` on the operated
// on context, before any other.
ParseLatin.prototype.useFirst = useFactory(function (context, key, plugins) {
  context[key] = plugins.concat(context[key])
})

// PARENT NODES
//
// All these nodes are `pluggable`: they come with a `use` method which accepts
// a plugin (`function(NLCSTNode)`).
// Every time one of these methods are called, the plugin is invoked with the
// node, allowing for easy modification.
//
// In fact, the internal transformation from `tokenize` (a list of words, white
// space, punctuation, and symbols) to `tokenizeRoot` (an NLCST tree), is also
// implemented through this mechanism.

// Create a `WordNode` with its children set to a single `TextNode`, its value
// set to the given `value`.
pluggable(ParseLatin, 'tokenizeWord', function (value, eat) {
  const add = (eat || noopEat)('')
  const parent = {type: 'WordNode', children: []}

  this.tokenizeText(value, eat, parent)

  return add(parent)
})

// Create a `SentenceNode` with its children set to `Node`s, their values set
// to the tokenized given `value`.
//
// Unless plugins add new nodes, the sentence is populated by `WordNode`s,
// `SymbolNode`s, `PunctuationNode`s, and `WhiteSpaceNode`s.
pluggable(
  ParseLatin,
  'tokenizeSentence',
  parserFactory({type: 'SentenceNode', tokenizer: 'tokenize'})
)

// Create a `ParagraphNode` with its children set to `Node`s, their values set
// to the tokenized given `value`.
//
// Unless plugins add new nodes, the paragraph is populated by `SentenceNode`s
// and `WhiteSpaceNode`s.
pluggable(
  ParseLatin,
  'tokenizeParagraph',
  parserFactory({
    type: 'ParagraphNode',
    delimiter: terminalMarker,
    delimiterType: 'PunctuationNode',
    tokenizer: 'tokenizeSentence'
  })
)

// Create a `RootNode` with its children set to `Node`s, their values set to the
// tokenized given `value`.
pluggable(
  ParseLatin,
  'tokenizeRoot',
  parserFactory({
    type: 'RootNode',
    delimiter: newLine,
    delimiterType: 'WhiteSpaceNode',
    tokenizer: 'tokenizeParagraph'
  })
)

// PLUGINS
// Registration order below is the execution order inside `run`.

ParseLatin.prototype.use('tokenizeSentence', [
  mergeInitialWordSymbol,
  mergeFinalWordSymbol,
  mergeInnerWordSymbol,
  mergeInnerWordSlash,
  mergeInitialisms,
  mergeWords,
  patchPosition
])

ParseLatin.prototype.use('tokenizeParagraph', [
  mergeNonWordSentences,
  mergeAffixSymbol,
  mergeInitialLowerCaseLetterSentences,
  mergeInitialDigitSentences,
  mergePrefixExceptions,
  mergeAffixExceptions,
  mergeRemainingFullStops,
  makeInitialWhiteSpaceSiblings,
  makeFinalWhiteSpaceSiblings,
  breakImplicitSentences,
  removeEmptyNodes,
  patchPosition
])

ParseLatin.prototype.use('tokenizeRoot', [
  makeInitialWhiteSpaceSiblings,
  makeFinalWhiteSpaceSiblings,
  removeEmptyNodes,
  patchPosition
])
|
||||
|
||||
// TEXT NODES

// Factory to create a `Text` tokenizer for a bound node `type`.
function createTextFactory(kind) {
  const type = kind + 'Node'

  // Construct a `Text` from a bound `type`.
  // `null`/`undefined` values become the empty string; everything else is
  // stringified. The (optional) `eat` mechanism adds the node to `parent`
  // and patches its position.
  return function createText(value, eat, parent) {
    const text = value === null || value === undefined ? '' : value

    return (eat || noopEat)(text)({type, value: String(text)}, parent)
  }
}
|
||||
|
||||
// Make a method “pluggable”.
// Installs `callback` on `Constructor.prototype[key]`, wrapped so that its
// result is post-processed by the instance's `run` with the plugins
// registered under `key`.
function pluggable(Constructor, key, callback) {
  const wrapped = function (...parameters) {
    const result = callback.apply(this, parameters)

    return this.run(key, result)
  }

  Constructor.prototype[key] = wrapped
}
|
||||
|
||||
// Factory to inject `plugins`. Takes `callback` for the actual inserting.
function useFactory(callback) {
  // Validate that `plugins` can be inserted for `key`, then delegate the
  // actual inserting to the bound `callback`.
  // Must be a regular function: it is attached to a prototype and relies on
  // `this` being the instance.
  return function use(key, plugins) {
    // Throw if the method is not pluggable.
    if (!(key in this)) {
      throw new Error(
        'Illegal Invocation: Unsupported `key` for ' +
          '`use(key, plugins)`. Make sure `key` is a ' +
          'supported function'
      )
    }

    // Fail silently when no plugins are given.
    if (!plugins) {
      return
    }

    const wareKey = key + 'Plugins'

    // Normalize to a fresh list (copy arrays so callers keep ownership).
    const list = typeof plugins === 'function' ? [plugins] : plugins.concat()

    // Make sure `wareKey` exists.
    this[wareKey] = this[wareKey] || []

    // Invoke callback with the ware key and plugins.
    callback(this, wareKey, list)
  }
}
|
||||
|
||||
// Add mechanism used when text-tokenisers are called directly outside of the
// `tokenize` function: append `node` to `parent` (when given) and return it.
function noopAdd(node, parent) {
  if (parent) {
    parent.children.push(node)
  }

  return node
}

// Eat and add mechanism without adding positional information, used when
// text-tokenisers are called directly outside of the `tokenize` function.
// Ignores the eaten value and hands back the bare add mechanism.
function noopEat() {
  return noopAdd
}
|
||||
18
node_modules/parse-latin/lib/parser.js
generated
vendored
Normal file
18
node_modules/parse-latin/lib/parser.js
generated
vendored
Normal file
|
|
@ -0,0 +1,18 @@
|
|||
import {tokenizerFactory} from './tokenizer.js'
|
||||
|
||||
// Construct a parser based on `options`.
// The parser calls `this[options.tokenizer]` to produce children and wraps
// them in a node of `options.type`; when `options.delimiter` is given the
// children are first grouped by a delimiter-based tokenizer.
export function parserFactory(options) {
  const {type, tokenizer: tokenizerProperty, delimiter} = options
  const splitter = delimiter
    ? tokenizerFactory(options.delimiterType, delimiter)
    : undefined

  // Must be a regular function: it is installed as a method and reads the
  // tokenizer off `this`.
  return function parser(value) {
    const children = this[tokenizerProperty](value)

    return {type, children: splitter ? splitter(children) : children}
  }
}
|
||||
51
node_modules/parse-latin/lib/plugin/break-implicit-sentences.js
generated
vendored
Normal file
51
node_modules/parse-latin/lib/plugin/break-implicit-sentences.js
generated
vendored
Normal file
|
|
@ -0,0 +1,51 @@
|
|||
import {toString} from 'nlcst-to-string'
|
||||
import {modifyChildren} from 'unist-util-modify-children'
|
||||
|
||||
// Break a sentence if a white space with more than one new-line is found.
export const breakImplicitSentences = modifyChildren(function (
  child,
  index,
  parent
) {
  if (child.type !== 'SentenceNode') {
    return
  }

  const children = child.children

  // Ignore first and last child.
  let position = 0

  while (++position < children.length - 1) {
    const node = children[position]

    // Only split on white space spanning two or more line breaks
    // (i.e. at least one blank line).
    if (
      node.type !== 'WhiteSpaceNode' ||
      toString(node).split(/\r\n|\r|\n/).length < 3
    ) {
      continue
    }

    // Truncate the current sentence before the break…
    child.children = children.slice(0, position)

    // …and collect everything after it into a new sentence.
    const insertion = {
      type: 'SentenceNode',
      children: children.slice(position + 1)
    }

    const tail = children[position - 1]
    const head = children[position + 1]

    // Insert the white space and the new sentence as siblings after `child`.
    parent.children.splice(index + 1, 0, node, insertion)

    // Update position: `child` now ends at `tail`, the new sentence spans
    // from `head` to the old end.
    if (child.position && tail.position && head.position) {
      const end = child.position.end

      child.position.end = tail.position.end

      insertion.position = {start: head.position.start, end}
    }

    // Next, continue with the inserted white-space node.
    return index + 1
  }
})
|
||||
27
node_modules/parse-latin/lib/plugin/make-final-white-space-siblings.js
generated
vendored
Normal file
27
node_modules/parse-latin/lib/plugin/make-final-white-space-siblings.js
generated
vendored
Normal file
|
|
@ -0,0 +1,27 @@
|
|||
import {modifyChildren} from 'unist-util-modify-children'
|
||||
|
||||
// Move white space ending a paragraph up, so they are the siblings of
// paragraphs.
export const makeFinalWhiteSpaceSiblings = modifyChildren(function (
  node,
  index,
  parent
) {
  const kids = node.children

  if (!kids || kids.length === 0) {
    return
  }

  if (kids[kids.length - 1].type !== 'WhiteSpaceNode') {
    return
  }

  // Lift the trailing white space out of `node`, placing it right after
  // `node` among its siblings.
  parent.children.splice(index + 1, 0, kids.pop())

  const newLast = kids[kids.length - 1]

  // Shrink `node` to end at its (new) last child.
  if (newLast && newLast.position && node.position) {
    node.position.end = newLast.position.end
  }

  // Next, iterate over the current node again.
  return index
})
|
||||
23
node_modules/parse-latin/lib/plugin/make-initial-white-space-siblings.js
generated
vendored
Normal file
23
node_modules/parse-latin/lib/plugin/make-initial-white-space-siblings.js
generated
vendored
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
import {visitChildren} from 'unist-util-visit-children'
|
||||
|
||||
// Move white space starting a sentence up, so they are the siblings of
// sentences.
export const makeInitialWhiteSpaceSiblings = visitChildren(function (
  node,
  index,
  parent
) {
  const kids = node.children

  if (!kids || kids.length === 0 || kids[0].type !== 'WhiteSpaceNode') {
    return
  }

  // Hoist the leading white space out of `node`, inserting it directly
  // before `node` among its siblings.
  parent.children.splice(index, 0, kids.shift())

  const head = kids[0]

  // Shrink `node` to start at its (new) first child.
  if (head && head.position && node.position) {
    node.position.start = head.position.start
  }
})
|
||||
47
node_modules/parse-latin/lib/plugin/merge-affix-exceptions.js
generated
vendored
Normal file
47
node_modules/parse-latin/lib/plugin/merge-affix-exceptions.js
generated
vendored
Normal file
|
|
@ -0,0 +1,47 @@
|
|||
import {toString} from 'nlcst-to-string'
|
||||
import {modifyChildren} from 'unist-util-modify-children'
|
||||
|
||||
// Merge a sentence into its previous sentence, when the sentence starts with a
// comma (or semicolon) before any word.
export const mergeAffixExceptions = modifyChildren(function (
  sentence,
  index,
  parent
) {
  const nodes = sentence.children

  if (!nodes || nodes.length === 0 || index < 1) {
    return
  }

  for (const node of nodes) {
    // A word before any comma/semicolon: this is a real sentence.
    if (node.type === 'WordNode') {
      return
    }

    if (node.type !== 'SymbolNode' && node.type !== 'PunctuationNode') {
      continue
    }

    const value = toString(node)

    if (value !== ',' && value !== ';') {
      return
    }

    // Fold this sentence's children into the previous sibling and drop it.
    const previous = parent.children[index - 1]
    previous.children = previous.children.concat(nodes)

    // Update position.
    if (previous.position && sentence.position) {
      previous.position.end = sentence.position.end
    }

    parent.children.splice(index, 1)

    // Next, iterate over the node *now* at the current position.
    return index
  }
})
|
||||
38
node_modules/parse-latin/lib/plugin/merge-affix-symbol.js
generated
vendored
Normal file
38
node_modules/parse-latin/lib/plugin/merge-affix-symbol.js
generated
vendored
Normal file
|
|
@ -0,0 +1,38 @@
|
|||
import {toString} from 'nlcst-to-string'
|
||||
import {modifyChildren} from 'unist-util-modify-children'
|
||||
|
||||
// Closing or final punctuation, or terminal markers that should still be
|
||||
// included in the previous sentence, even though they follow the sentence’s
|
||||
// terminal marker.
|
||||
import {affixSymbol} from '../expressions.js'
|
||||
|
||||
// Move certain punctuation following a terminal marker (thus in the next
// sentence) to the previous sentence.
export const mergeAffixSymbol = modifyChildren(function (sentence, index, parent) {
  const nodes = sentence.children

  if (!nodes || nodes.length === 0 || index < 1) {
    return
  }

  const head = nodes[0]

  if (
    (head.type !== 'SymbolNode' && head.type !== 'PunctuationNode') ||
    !affixSymbol.test(toString(head))
  ) {
    return
  }

  const next = nodes[1]
  const previous = parent.children[index - 1]

  // Move the affix symbol to the end of the previous sentence.
  previous.children.push(nodes.shift())

  // Update position: the previous sentence grows…
  if (head.position && previous.position) {
    previous.position.end = head.position.end
  }

  // …and this sentence now starts at its (new) first child.
  if (next && next.position && sentence.position) {
    sentence.position.start = next.position.start
  }

  // Next, iterate over the previous node again.
  return index - 1
})
|
||||
40
node_modules/parse-latin/lib/plugin/merge-final-word-symbol.js
generated
vendored
Normal file
40
node_modules/parse-latin/lib/plugin/merge-final-word-symbol.js
generated
vendored
Normal file
|
|
@ -0,0 +1,40 @@
|
|||
import {toString} from 'nlcst-to-string'
|
||||
import {modifyChildren} from 'unist-util-modify-children'
|
||||
|
||||
// Merge certain punctuation marks into their preceding words.
export const mergeFinalWordSymbol = modifyChildren(function (node, index, parent) {
  if (index < 1) {
    return
  }

  if (node.type !== 'SymbolNode' && node.type !== 'PunctuationNode') {
    return
  }

  if (toString(node) !== '-') {
    return
  }

  const siblings = parent.children
  const previous = siblings[index - 1]
  const next = siblings[index + 1]

  // A dash between two words joins them elsewhere; only a trailing dash
  // after a word is merged here.
  if (next && next.type === 'WordNode') {
    return
  }

  if (!previous || previous.type !== 'WordNode') {
    return
  }

  // Remove `node` from the parent…
  siblings.splice(index, 1)

  // …and append it to the previous word.
  previous.children.push(node)

  // Update position.
  if (previous.position && node.position) {
    previous.position.end = node.position.end
  }

  // Next, iterate over the node *now* at the current position (which was
  // the next node).
  return index
})
|
||||
34
node_modules/parse-latin/lib/plugin/merge-initial-digit-sentences.js
generated
vendored
Normal file
34
node_modules/parse-latin/lib/plugin/merge-initial-digit-sentences.js
generated
vendored
Normal file
|
|
@ -0,0 +1,34 @@
|
|||
import {toString} from 'nlcst-to-string'
|
||||
import {modifyChildren} from 'unist-util-modify-children'
|
||||
import {digitStart} from '../expressions.js'
|
||||
|
||||
// Merge a sentence into its previous sentence, when the sentence starts with a
// digit-initial word.
export const mergeInitialDigitSentences = modifyChildren(function (
  sentence,
  index,
  parent
) {
  const nodes = sentence.children
  const siblings = parent.children
  const previous = siblings[index - 1]
  const head = nodes[0]

  if (
    !previous ||
    !head ||
    head.type !== 'WordNode' ||
    !digitStart.test(toString(head))
  ) {
    return
  }

  // Fold this sentence's children into the previous sentence and drop it.
  previous.children = previous.children.concat(nodes)
  siblings.splice(index, 1)

  // Update position.
  if (previous.position && sentence.position) {
    previous.position.end = sentence.position.end
  }

  // Next, iterate over the node *now* at the current position.
  return index
})
|
||||
48
node_modules/parse-latin/lib/plugin/merge-initial-lower-case-letter-sentences.js
generated
vendored
Normal file
48
node_modules/parse-latin/lib/plugin/merge-initial-lower-case-letter-sentences.js
generated
vendored
Normal file
|
|
@ -0,0 +1,48 @@
|
|||
import {toString} from 'nlcst-to-string'
|
||||
import {modifyChildren} from 'unist-util-modify-children'
|
||||
|
||||
// Initial lowercase letter.
|
||||
import {lowerInitial} from '../expressions.js'
|
||||
|
||||
// Merge a sentence into its previous sentence, when the sentence starts with a
// lower case letter.
export const mergeInitialLowerCaseLetterSentences = modifyChildren(function (
  sentence,
  index,
  parent
) {
  const nodes = sentence.children

  if (!nodes || nodes.length === 0 || index < 1) {
    return
  }

  // Scan up to the first word; leading punctuation/symbols cancel the merge.
  for (const node of nodes) {
    if (node.type === 'SymbolNode' || node.type === 'PunctuationNode') {
      return
    }

    if (node.type !== 'WordNode') {
      continue
    }

    if (!lowerInitial.test(toString(node))) {
      return
    }

    // Fold this sentence's children into the previous sibling and drop it.
    const siblings = parent.children
    const previous = siblings[index - 1]

    previous.children = previous.children.concat(nodes)

    siblings.splice(index, 1)

    // Update position.
    if (previous.position && sentence.position) {
      previous.position.end = sentence.position.end
    }

    // Next, iterate over the node *now* at the current position.
    return index
  }
})
|
||||
42
node_modules/parse-latin/lib/plugin/merge-initial-word-symbol.js
generated
vendored
Normal file
42
node_modules/parse-latin/lib/plugin/merge-initial-word-symbol.js
generated
vendored
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
import {toString} from 'nlcst-to-string'
|
||||
import {modifyChildren} from 'unist-util-modify-children'
|
||||
|
||||
// Merge certain punctuation marks into their following words.
export const mergeInitialWordSymbol = modifyChildren(function (node, index, parent) {
  if (node.type !== 'SymbolNode' && node.type !== 'PunctuationNode') {
    return
  }

  if (toString(node) !== '&') {
    return
  }

  const siblings = parent.children
  const next = siblings[index + 1]

  // There must be a following word…
  if (!next || next.type !== 'WordNode') {
    return
  }

  // …and no preceding word.
  if (index > 0 && siblings[index - 1].type === 'WordNode') {
    return
  }

  // Remove `node` from the parent…
  siblings.splice(index, 1)

  // …and prepend it to the next word.
  next.children.unshift(node)

  // Update position.
  if (next.position && node.position) {
    next.position.start = node.position.start
  }

  // Next, iterate over the node at the previous position, as it's now adjacent
  // to a following word.
  return index - 1
})
|
||||
62
node_modules/parse-latin/lib/plugin/merge-initialisms.js
generated
vendored
Normal file
62
node_modules/parse-latin/lib/plugin/merge-initialisms.js
generated
vendored
Normal file
|
|
@ -0,0 +1,62 @@
|
|||
import {toString} from 'nlcst-to-string'
|
||||
import {modifyChildren} from 'unist-util-modify-children'
|
||||
import {numerical} from '../expressions.js'
|
||||
|
||||
// Merge initialisms: a full stop following a word whose children alternate
// single letters and dots (e.g. the trailing `.` after `U.S`) is folded into
// that word.
export const mergeInitialisms = modifyChildren(function (child, index, parent) {
  if (index > 0 && toString(child) === '.') {
    const siblings = parent.children

    const previous = siblings[index - 1]
    const children = previous.children

    // An initialism candidate has an odd number of children (letter, dot,
    // letter, …) and more than one child.
    if (
      previous.type === 'WordNode' &&
      children &&
      children.length !== 1 &&
      children.length % 2 !== 0
    ) {
      let position = children.length
      let isAllDigits = true

      // Walk backwards over the previous word's children; even positions
      // must be single characters, odd positions must be dots.
      while (children[--position]) {
        const otherChild = children[position]

        const value = toString(otherChild)

        if (position % 2 === 0) {
          // Initialisms consist of one character values.
          if (value.length > 1) {
            return
          }

          if (!numerical.test(value)) {
            isAllDigits = false
          }
        } else if (value !== '.') {
          if (position < children.length - 2) {
            // A non-dot separator deep inside: stop scanning but keep what
            // was validated so far.
            break
          } else {
            return
          }
        }
      }

      // All-digit "initialisms" (e.g. dotted numbers) are left alone.
      if (!isAllDigits) {
        // Remove `child` from parent.
        siblings.splice(index, 1)

        // Add child to the previous children.
        children.push(child)

        // Update position.
        if (previous.position && child.position) {
          previous.position.end = child.position.end
        }

        // Next, iterate over the node *now* at the current position.
        return index
      }
    }
  }
})
|
||||
50
node_modules/parse-latin/lib/plugin/merge-inner-word-slash.js
generated
vendored
Normal file
50
node_modules/parse-latin/lib/plugin/merge-inner-word-slash.js
generated
vendored
Normal file
|
|
@ -0,0 +1,50 @@
|
|||
import {toString} from 'nlcst-to-string'
|
||||
import {modifyChildren} from 'unist-util-modify-children'
|
||||
|
||||
const slash = '/'

// Merge words joined by certain punctuation marks.
export const mergeInnerWordSlash = modifyChildren(function (node, index, parent) {
  const siblings = parent.children
  const previous = siblings[index - 1]

  if (!previous || previous.type !== 'WordNode') {
    return
  }

  if (
    (node.type !== 'SymbolNode' && node.type !== 'PunctuationNode') ||
    toString(node) !== slash
  ) {
    return
  }

  const next = siblings[index + 1]
  const previousValue = toString(previous)
  let tail = node
  let queue = [node]
  let count = 1
  let nextValue = ''

  // When a word directly follows the slash, it is merged as well.
  if (next && next.type === 'WordNode') {
    nextValue = toString(next)
    tail = next
    queue = queue.concat(next.children)
    count++
  }

  // Only merge short words around the slash (e.g. `w/`, `w/o`).
  if (previousValue.length > 2 || (nextValue && nextValue.length > 2)) {
    return
  }

  // Add all found tokens to the previous word's children.
  previous.children = previous.children.concat(queue)

  siblings.splice(index, count)

  // Update position.
  if (previous.position && tail.position) {
    previous.position.end = tail.position.end
  }

  // Next, iterate over the node *now* at the current position.
  return index
})
|
||||
72
node_modules/parse-latin/lib/plugin/merge-inner-word-symbol.js
generated
vendored
Normal file
72
node_modules/parse-latin/lib/plugin/merge-inner-word-symbol.js
generated
vendored
Normal file
|
|
@ -0,0 +1,72 @@
|
|||
import {toString} from 'nlcst-to-string'
|
||||
import {modifyChildren} from 'unist-util-modify-children'
|
||||
|
||||
// Symbols part of surrounding words.
|
||||
import {wordSymbolInner} from '../expressions.js'
|
||||
|
||||
// Merge words joined by certain punctuation marks.
export const mergeInnerWordSymbol = modifyChildren(function (
  child,
  index,
  parent
) {
  if (
    index > 0 &&
    (child.type === 'SymbolNode' || child.type === 'PunctuationNode')
  ) {
    const siblings = parent.children
    const previous = siblings[index - 1]

    if (previous && previous.type === 'WordNode') {
      let position = index - 1
      let tokens = []
      let queue = []

      // - If a token which is neither word nor inner word symbol is found,
      //   the loop is broken
      // - If an inner word symbol is found, it’s queued
      // - If a word is found, it’s queued (and the queue stored and emptied)
      while (siblings[++position]) {
        const sibling = siblings[position]

        if (sibling.type === 'WordNode') {
          tokens = tokens.concat(queue, sibling.children)

          queue = []
        } else if (
          (sibling.type === 'SymbolNode' ||
            sibling.type === 'PunctuationNode') &&
          wordSymbolInner.test(toString(sibling))
        ) {
          queue.push(sibling)
        } else {
          break
        }
      }

      // Only merge when at least one following word was collected; a
      // trailing queue of symbols without a word is left untouched.
      if (tokens.length > 0) {
        // If there is a queue, remove its length from `position`.
        if (queue.length > 0) {
          position -= queue.length
        }

        // Remove every (one or more) inner-word punctuation marks and children
        // of words.
        siblings.splice(index, position - index)

        // Add all found tokens to `prev`s children.
        previous.children = previous.children.concat(tokens)

        const last = tokens[tokens.length - 1]

        // Update position.
        if (previous.position && last.position) {
          previous.position.end = last.position.end
        }

        // Next, iterate over the node *now* at the current position.
        return index
      }
    }
  }
})
|
||||
50
node_modules/parse-latin/lib/plugin/merge-non-word-sentences.js
generated
vendored
Normal file
50
node_modules/parse-latin/lib/plugin/merge-non-word-sentences.js
generated
vendored
Normal file
|
|
@ -0,0 +1,50 @@
|
|||
import {modifyChildren} from 'unist-util-modify-children'
|
||||
|
||||
// Merge a sentence into the following sentence, when the sentence does not
|
||||
// contain word tokens.
|
||||
export const mergeNonWordSentences = modifyChildren(function (
|
||||
child,
|
||||
index,
|
||||
parent
|
||||
) {
|
||||
const children = child.children
|
||||
let position = -1
|
||||
|
||||
while (children[++position]) {
|
||||
if (children[position].type === 'WordNode') {
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
const previous = parent.children[index - 1]
|
||||
|
||||
if (previous) {
|
||||
previous.children = previous.children.concat(children)
|
||||
|
||||
// Remove the child.
|
||||
parent.children.splice(index, 1)
|
||||
|
||||
// Patch position.
|
||||
if (previous.position && child.position) {
|
||||
previous.position.end = child.position.end
|
||||
}
|
||||
|
||||
// Next, iterate over the node *now* at the current position (which was the
|
||||
// next node).
|
||||
return index
|
||||
}
|
||||
|
||||
const next = parent.children[index + 1]
|
||||
|
||||
if (next) {
|
||||
next.children = children.concat(next.children)
|
||||
|
||||
// Patch position.
|
||||
if (next.position && child.position) {
|
||||
next.position.start = child.position.start
|
||||
}
|
||||
|
||||
// Remove the child.
|
||||
parent.children.splice(index, 1)
|
||||
}
|
||||
})
|
||||
72
node_modules/parse-latin/lib/plugin/merge-prefix-exceptions.js
generated
vendored
Normal file
72
node_modules/parse-latin/lib/plugin/merge-prefix-exceptions.js
generated
vendored
Normal file
|
|
@ -0,0 +1,72 @@
|
|||
import {toString} from 'nlcst-to-string'
|
||||
import {modifyChildren} from 'unist-util-modify-children'
|
||||
|
||||
// Full stop characters that should not be treated as terminal sentence markers:
|
||||
// A case-insensitive abbreviation.
|
||||
const abbreviationPrefix = new RegExp(
|
||||
'^(' +
|
||||
'[0-9]{1,3}|' +
|
||||
'[a-z]|' +
|
||||
// Common Latin Abbreviations:
|
||||
// Based on: <https://en.wikipedia.org/wiki/List_of_Latin_abbreviations>.
|
||||
// Where only the abbreviations written without joining full stops,
|
||||
// but with a final full stop, were extracted.
|
||||
//
|
||||
// circa, capitulus, confer, compare, centum weight, eadem, (et) alii,
|
||||
// et cetera, floruit, foliis, ibidem, idem, nemine && contradicente,
|
||||
// opere && citato, (per) cent, (per) procurationem, (pro) tempore,
|
||||
// sic erat scriptum, (et) sequentia, statim, videlicet. */
|
||||
'al|ca|cap|cca|cent|cf|cit|con|cp|cwt|ead|etc|ff|' +
|
||||
'fl|ibid|id|nem|op|pro|seq|sic|stat|tem|viz' +
|
||||
')$'
|
||||
)
|
||||
|
||||
// Merge a sentence into its next sentence, when the sentence ends with a
|
||||
// certain word.
|
||||
export const mergePrefixExceptions = modifyChildren(function (
|
||||
child,
|
||||
index,
|
||||
parent
|
||||
) {
|
||||
const children = child.children
|
||||
|
||||
if (children && children.length > 1) {
|
||||
const period = children[children.length - 1]
|
||||
|
||||
if (period && toString(period) === '.') {
|
||||
const node = children[children.length - 2]
|
||||
|
||||
if (
|
||||
node &&
|
||||
node.type === 'WordNode' &&
|
||||
abbreviationPrefix.test(toString(node).toLowerCase())
|
||||
) {
|
||||
// Merge period into abbreviation.
|
||||
node.children.push(period)
|
||||
children.pop()
|
||||
|
||||
// Update position.
|
||||
if (period.position && node.position) {
|
||||
node.position.end = period.position.end
|
||||
}
|
||||
|
||||
// Merge sentences.
|
||||
const next = parent.children[index + 1]
|
||||
|
||||
if (next) {
|
||||
child.children = children.concat(next.children)
|
||||
|
||||
parent.children.splice(index + 1, 1)
|
||||
|
||||
// Update position.
|
||||
if (next.position && child.position) {
|
||||
child.position.end = next.position.end
|
||||
}
|
||||
|
||||
// Next, iterate over the current node again.
|
||||
return index - 1
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
90
node_modules/parse-latin/lib/plugin/merge-remaining-full-stops.js
generated
vendored
Normal file
90
node_modules/parse-latin/lib/plugin/merge-remaining-full-stops.js
generated
vendored
Normal file
|
|
@ -0,0 +1,90 @@
|
|||
import {toString} from 'nlcst-to-string'
|
||||
import {visitChildren} from 'unist-util-visit-children'
|
||||
|
||||
// Full stop characters that should not be treated as terminal sentence markers:
|
||||
// A case-insensitive abbreviation.
|
||||
import {terminalMarker} from '../expressions.js'
|
||||
|
||||
// Merge non-terminal-marker full stops into the previous word (if available),
|
||||
// or the next word (if available).
|
||||
export const mergeRemainingFullStops = visitChildren(function (child) {
|
||||
const children = child.children
|
||||
let position = children.length
|
||||
let hasFoundDelimiter = false
|
||||
|
||||
while (children[--position]) {
|
||||
const grandchild = children[position]
|
||||
|
||||
if (
|
||||
grandchild.type !== 'SymbolNode' &&
|
||||
grandchild.type !== 'PunctuationNode'
|
||||
) {
|
||||
// This is a sentence without terminal marker, so we 'fool' the code to
|
||||
// make it think we have found one.
|
||||
if (grandchild.type === 'WordNode') {
|
||||
hasFoundDelimiter = true
|
||||
}
|
||||
|
||||
continue
|
||||
}
|
||||
|
||||
// Exit when this token is not a terminal marker.
|
||||
if (!terminalMarker.test(toString(grandchild))) {
|
||||
continue
|
||||
}
|
||||
|
||||
// Ignore the first terminal marker found (starting at the end), as it
|
||||
// should not be merged.
|
||||
if (!hasFoundDelimiter) {
|
||||
hasFoundDelimiter = true
|
||||
|
||||
continue
|
||||
}
|
||||
|
||||
// Only merge a single full stop.
|
||||
if (toString(grandchild) !== '.') {
|
||||
continue
|
||||
}
|
||||
|
||||
const previous = children[position - 1]
|
||||
const next = children[position + 1]
|
||||
|
||||
if (previous && previous.type === 'WordNode') {
|
||||
const nextNext = children[position + 2]
|
||||
|
||||
// Continue when the full stop is followed by a space and another full
|
||||
// stop, such as: `{.} .`
|
||||
if (
|
||||
next &&
|
||||
nextNext &&
|
||||
next.type === 'WhiteSpaceNode' &&
|
||||
toString(nextNext) === '.'
|
||||
) {
|
||||
continue
|
||||
}
|
||||
|
||||
// Remove `child` from parent.
|
||||
children.splice(position, 1)
|
||||
|
||||
// Add the punctuation mark at the end of the previous node.
|
||||
previous.children.push(grandchild)
|
||||
|
||||
// Update position.
|
||||
if (grandchild.position && previous.position) {
|
||||
previous.position.end = grandchild.position.end
|
||||
}
|
||||
|
||||
position--
|
||||
} else if (next && next.type === 'WordNode') {
|
||||
// Remove `child` from parent.
|
||||
children.splice(position, 1)
|
||||
|
||||
// Add the punctuation mark at the start of the next node.
|
||||
next.children.unshift(grandchild)
|
||||
|
||||
if (grandchild.position && next.position) {
|
||||
next.position.start = grandchild.position.start
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
28
node_modules/parse-latin/lib/plugin/merge-words.js
generated
vendored
Normal file
28
node_modules/parse-latin/lib/plugin/merge-words.js
generated
vendored
Normal file
|
|
@ -0,0 +1,28 @@
|
|||
import {modifyChildren} from 'unist-util-modify-children'
|
||||
|
||||
// Merge multiple words. This merges the children of adjacent words, something
|
||||
// which should not occur naturally by parse-latin, but might happen when custom
|
||||
// tokens were passed in.
|
||||
export const mergeWords = modifyChildren(function (child, index, parent) {
|
||||
const siblings = parent.children
|
||||
|
||||
if (child.type === 'WordNode') {
|
||||
const next = siblings[index + 1]
|
||||
|
||||
if (next && next.type === 'WordNode') {
|
||||
// Remove `next` from parent.
|
||||
siblings.splice(index + 1, 1)
|
||||
|
||||
// Add the punctuation mark at the end of the previous node.
|
||||
child.children = child.children.concat(next.children)
|
||||
|
||||
// Update position.
|
||||
if (next.position && child.position) {
|
||||
child.position.end = next.position.end
|
||||
}
|
||||
|
||||
// Next, re-iterate the current node.
|
||||
return index
|
||||
}
|
||||
}
|
||||
})
|
||||
31
node_modules/parse-latin/lib/plugin/patch-position.js
generated
vendored
Normal file
31
node_modules/parse-latin/lib/plugin/patch-position.js
generated
vendored
Normal file
|
|
@ -0,0 +1,31 @@
|
|||
import {visitChildren} from 'unist-util-visit-children'
|
||||
|
||||
// Patch the position on a parent node based on its first and last child.
|
||||
export const patchPosition = visitChildren(function (child, index, node) {
|
||||
const siblings = node.children
|
||||
|
||||
if (!child.position) {
|
||||
return
|
||||
}
|
||||
|
||||
if (
|
||||
index < 1 &&
|
||||
/* c8 ignore next */
|
||||
(!node.position || !node.position.start)
|
||||
) {
|
||||
patch(node)
|
||||
node.position.start = child.position.start
|
||||
}
|
||||
|
||||
if (index === siblings.length - 1 && (!node.position || !node.position.end)) {
|
||||
patch(node)
|
||||
node.position.end = child.position.end
|
||||
}
|
||||
})
|
||||
|
||||
// Add a `position` object when it does not yet exist on `node`.
|
||||
function patch(node) {
|
||||
if (!node.position) {
|
||||
node.position = {}
|
||||
}
|
||||
}
|
||||
12
node_modules/parse-latin/lib/plugin/remove-empty-nodes.js
generated
vendored
Normal file
12
node_modules/parse-latin/lib/plugin/remove-empty-nodes.js
generated
vendored
Normal file
|
|
@ -0,0 +1,12 @@
|
|||
import {modifyChildren} from 'unist-util-modify-children'
|
||||
|
||||
// Remove empty children.
|
||||
export const removeEmptyNodes = modifyChildren(function (child, index, parent) {
|
||||
if ('children' in child && child.children.length === 0) {
|
||||
parent.children.splice(index, 1)
|
||||
|
||||
// Next, iterate over the node *now* at the current position (which was the
|
||||
// next node).
|
||||
return index
|
||||
}
|
||||
})
|
||||
42
node_modules/parse-latin/lib/tokenizer.js
generated
vendored
Normal file
42
node_modules/parse-latin/lib/tokenizer.js
generated
vendored
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
import {toString} from 'nlcst-to-string'
|
||||
|
||||
// Factory to create a tokenizer based on a given `expression`.
|
||||
export function tokenizerFactory(childType, expression) {
|
||||
return tokenizer
|
||||
|
||||
// A function that splits.
|
||||
function tokenizer(node) {
|
||||
const children = []
|
||||
const tokens = node.children
|
||||
const type = node.type
|
||||
let index = -1
|
||||
const lastIndex = tokens.length - 1
|
||||
let start = 0
|
||||
|
||||
while (++index < tokens.length) {
|
||||
if (
|
||||
index === lastIndex ||
|
||||
(tokens[index].type === childType &&
|
||||
expression.test(toString(tokens[index])))
|
||||
) {
|
||||
const first = tokens[start]
|
||||
const last = tokens[index]
|
||||
|
||||
const parent = {type, children: tokens.slice(start, index + 1)}
|
||||
|
||||
if (first.position && last.position) {
|
||||
parent.position = {
|
||||
start: first.position.start,
|
||||
end: last.position.end
|
||||
}
|
||||
}
|
||||
|
||||
children.push(parent)
|
||||
|
||||
start = index + 1
|
||||
}
|
||||
}
|
||||
|
||||
return children
|
||||
}
|
||||
}
|
||||
22
node_modules/parse-latin/license
generated
vendored
Normal file
22
node_modules/parse-latin/license
generated
vendored
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
(The MIT License)
|
||||
|
||||
Copyright (c) 2014 Titus Wormer <tituswormer@gmail.com>
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
'Software'), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
80
node_modules/parse-latin/package.json
generated
vendored
Normal file
80
node_modules/parse-latin/package.json
generated
vendored
Normal file
|
|
@ -0,0 +1,80 @@
|
|||
{
|
||||
"name": "parse-latin",
|
||||
"version": "5.0.1",
|
||||
"description": "Latin-script (natural language) parser",
|
||||
"license": "MIT",
|
||||
"keywords": [
|
||||
"nlcst",
|
||||
"latin",
|
||||
"script",
|
||||
"natural",
|
||||
"language",
|
||||
"parser"
|
||||
],
|
||||
"repository": "wooorm/parse-latin",
|
||||
"bugs": "https://github.com/wooorm/parse-latin/issues",
|
||||
"funding": {
|
||||
"type": "github",
|
||||
"url": "https://github.com/sponsors/wooorm"
|
||||
},
|
||||
"author": "Titus Wormer <tituswormer@gmail.com> (https://wooorm.com)",
|
||||
"contributors": [
|
||||
"Titus Wormer <tituswormer@gmail.com> (https://wooorm.com)"
|
||||
],
|
||||
"sideEffects": false,
|
||||
"type": "module",
|
||||
"main": "index.js",
|
||||
"files": [
|
||||
"lib/",
|
||||
"index.js"
|
||||
],
|
||||
"dependencies": {
|
||||
"nlcst-to-string": "^3.0.0",
|
||||
"unist-util-modify-children": "^3.0.0",
|
||||
"unist-util-visit-children": "^2.0.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@unicode/unicode-13.0.0": "^1.0.0",
|
||||
"c8": "^7.0.0",
|
||||
"is-hidden": "^2.0.0",
|
||||
"negate": "^1.0.0",
|
||||
"nlcst-test": "^3.0.0",
|
||||
"nyc": "^15.0.0",
|
||||
"prettier": "^2.0.0",
|
||||
"regenerate": "^1.0.0",
|
||||
"remark-cli": "^11.0.0",
|
||||
"remark-preset-wooorm": "^9.0.0",
|
||||
"unist-util-remove-position": "^4.0.0",
|
||||
"vfile": "^5.0.0",
|
||||
"xo": "^0.52.0"
|
||||
},
|
||||
"scripts": {
|
||||
"prepack": "npm run generate && npm run format",
|
||||
"fixture": "node script/generate-fixture.js",
|
||||
"generate": "node script/build-expressions.js",
|
||||
"format": "remark . -qfo && prettier . -w --loglevel warn && xo --fix",
|
||||
"test-api": "node --conditions development test/index.js",
|
||||
"test-coverage": "c8 --check-coverage --100 --reporter lcov npm run test-api",
|
||||
"test": "npm run generate && npm run format && npm run test-coverage"
|
||||
},
|
||||
"prettier": {
|
||||
"tabWidth": 2,
|
||||
"useTabs": false,
|
||||
"singleQuote": true,
|
||||
"bracketSpacing": false,
|
||||
"semi": false,
|
||||
"trailingComma": "none"
|
||||
},
|
||||
"xo": {
|
||||
"prettier": true,
|
||||
"rules": {
|
||||
"max-depth": "off",
|
||||
"no-misleading-character-class": "off"
|
||||
}
|
||||
},
|
||||
"remarkConfig": {
|
||||
"plugins": [
|
||||
"preset-wooorm"
|
||||
]
|
||||
}
|
||||
}
|
||||
150
node_modules/parse-latin/readme.md
generated
vendored
Normal file
150
node_modules/parse-latin/readme.md
generated
vendored
Normal file
|
|
@ -0,0 +1,150 @@
|
|||
# parse-latin
|
||||
|
||||
[![Build][build-badge]][build]
|
||||
[![Coverage][coverage-badge]][coverage]
|
||||
[![Downloads][downloads-badge]][downloads]
|
||||
[![Size][size-badge]][size]
|
||||
[![Chat][chat-badge]][chat]
|
||||
|
||||
A Latin-script language parser for [**retext**][retext] producing **[nlcst][]**
|
||||
nodes.
|
||||
|
||||
Whether Old English (“þā gewearþ þǣm hlāforde and þǣm hȳrigmannum wiþ ānum
penninge”), Icelandic (“Hvað er að frétta”), or French (“Où sont les
toilettes?”), `parse-latin` does a good job at tokenizing it.
|
||||
|
||||
Note also that `parse-latin` does a decent job at tokenizing Latin-like scripts,
such as Cyrillic (“Добро пожаловать!”), Georgian (“როგორა ხარ?”), Armenian
(“Շատ հաճելի է”), and others.
|
||||
|
||||
## Install
|
||||
|
||||
This package is ESM only: Node 12+ is needed to use it and it must be `import`ed
|
||||
instead of `require`d.
|
||||
|
||||
[npm][]:
|
||||
|
||||
```sh
|
||||
npm install parse-latin
|
||||
```
|
||||
|
||||
## Use
|
||||
|
||||
```js
|
||||
import {inspect} from 'unist-util-inspect'
|
||||
import {ParseLatin} from 'parse-latin'
|
||||
|
||||
const tree = new ParseLatin().parse('A simple sentence.')
|
||||
|
||||
console.log(inspect(tree))
|
||||
```
|
||||
|
||||
Which, when inspecting, yields:
|
||||
|
||||
```txt
|
||||
RootNode[1] (1:1-1:19, 0-18)
|
||||
└─0 ParagraphNode[1] (1:1-1:19, 0-18)
|
||||
└─0 SentenceNode[6] (1:1-1:19, 0-18)
|
||||
├─0 WordNode[1] (1:1-1:2, 0-1)
|
||||
│ └─0 TextNode "A" (1:1-1:2, 0-1)
|
||||
├─1 WhiteSpaceNode " " (1:2-1:3, 1-2)
|
||||
├─2 WordNode[1] (1:3-1:9, 2-8)
|
||||
│ └─0 TextNode "simple" (1:3-1:9, 2-8)
|
||||
├─3 WhiteSpaceNode " " (1:9-1:10, 8-9)
|
||||
├─4 WordNode[1] (1:10-1:18, 9-17)
|
||||
│ └─0 TextNode "sentence" (1:10-1:18, 9-17)
|
||||
└─5 PunctuationNode "." (1:18-1:19, 17-18)
|
||||
```
|
||||
|
||||
## API
|
||||
|
||||
This package exports the following identifiers: `ParseLatin`.
|
||||
There is no default export.
|
||||
|
||||
### `ParseLatin(value)`
|
||||
|
||||
Exposes the functionality needed to tokenize natural Latin-script languages into
|
||||
a syntax tree.
|
||||
If `value` is passed here, it’s not needed to give it to `#parse()`.
|
||||
|
||||
#### `ParseLatin#tokenize(value)`
|
||||
|
||||
Tokenize `value` (`string`) into letters and numbers (words), white space, and
|
||||
everything else (punctuation).
|
||||
The returned nodes are a flat list without paragraphs or sentences.
|
||||
|
||||
###### Returns
|
||||
|
||||
[`Array.<Node>`][nlcst] — Nodes.
|
||||
|
||||
#### `ParseLatin#parse(value)`
|
||||
|
||||
Tokenize `value` (`string`) into an [NLCST][] tree.
|
||||
The returned node is a `RootNode` with in it paragraphs and sentences.
|
||||
|
||||
###### Returns
|
||||
|
||||
[`Node`][nlcst] — Root node.
|
||||
|
||||
## Algorithm
|
||||
|
||||
> Note: The easiest way to see **how parse-latin tokenizes and parses**, is by
|
||||
> using the [online parser demo][demo], which
|
||||
> shows the syntax tree corresponding to the typed text.
|
||||
|
||||
`parse-latin` splits text into white space, word, and punctuation tokens.
|
||||
`parse-latin` starts out with a pretty easy definition, one that most other
|
||||
tokenizers use:
|
||||
|
||||
* A “word” is one or more letter or number characters
|
||||
* A “white space” is one or more white space characters
|
||||
* A “punctuation” is one or more of anything else
|
||||
|
||||
Then, it manipulates and merges those tokens into a ([nlcst][]) syntax tree,
|
||||
adding sentences and paragraphs where needed.
|
||||
|
||||
* Some punctuation marks are part of the word they occur in, such as
|
||||
`non-profit`, `she’s`, `G.I.`, `11:00`, `N/A`, `&c`, `nineteenth- and…`
|
||||
* Some full-stops do not mark a sentence end, such as `1.`, `e.g.`, `id.`
|
||||
* Although full-stops, question marks, and exclamation marks (sometimes) end a
|
||||
sentence, that end might not occur directly after the mark, such as `.)`,
|
||||
`."`
|
||||
* And many more exceptions
|
||||
|
||||
## License
|
||||
|
||||
[MIT][license] © [Titus Wormer][author]
|
||||
|
||||
<!-- Definitions -->
|
||||
|
||||
[build-badge]: https://github.com/wooorm/parse-latin/workflows/main/badge.svg
|
||||
|
||||
[build]: https://github.com/wooorm/parse-latin/actions
|
||||
|
||||
[coverage-badge]: https://img.shields.io/codecov/c/github/wooorm/parse-latin.svg
|
||||
|
||||
[coverage]: https://codecov.io/github/wooorm/parse-latin
|
||||
|
||||
[downloads-badge]: https://img.shields.io/npm/dm/parse-latin.svg
|
||||
|
||||
[downloads]: https://www.npmjs.com/package/parse-latin
|
||||
|
||||
[size-badge]: https://img.shields.io/bundlephobia/minzip/parse-latin.svg
|
||||
|
||||
[size]: https://bundlephobia.com/result?p=parse-latin
|
||||
|
||||
[chat-badge]: https://img.shields.io/badge/join%20the%20community-on%20spectrum-7b16ff.svg
|
||||
|
||||
[chat]: https://spectrum.chat/unified/retext
|
||||
|
||||
[npm]: https://docs.npmjs.com/cli/install
|
||||
|
||||
[demo]: https://wooorm.com/parse-latin/
|
||||
|
||||
[license]: license
|
||||
|
||||
[author]: https://wooorm.com
|
||||
|
||||
[retext]: https://github.com/retextjs/retext
|
||||
|
||||
[nlcst]: https://github.com/syntax-tree/nlcst
|
||||
Loading…
Add table
Add a link
Reference in a new issue