🎉 initiate project *astro_rewrite*

sindrekjelsrud 2023-07-19 21:31:30 +02:00
parent ffd4d5e86c
commit 2ba37bfbe3
8658 changed files with 2268794 additions and 2538 deletions

1
node_modules/parse-latin/index.js generated vendored Normal file

@@ -0,0 +1 @@
export {ParseLatin} from './lib/index.js'

19
node_modules/parse-latin/lib/expressions.js generated vendored Normal file

File diff suppressed because one or more lines are too long

424
node_modules/parse-latin/lib/index.js generated vendored Normal file

@@ -0,0 +1,424 @@
import {mergeInitialWordSymbol} from './plugin/merge-initial-word-symbol.js'
import {mergeFinalWordSymbol} from './plugin/merge-final-word-symbol.js'
import {mergeInnerWordSymbol} from './plugin/merge-inner-word-symbol.js'
import {mergeInnerWordSlash} from './plugin/merge-inner-word-slash.js'
import {mergeInitialisms} from './plugin/merge-initialisms.js'
import {mergeWords} from './plugin/merge-words.js'
import {patchPosition} from './plugin/patch-position.js'
import {mergeNonWordSentences} from './plugin/merge-non-word-sentences.js'
import {mergeAffixSymbol} from './plugin/merge-affix-symbol.js'
import {mergeInitialLowerCaseLetterSentences} from './plugin/merge-initial-lower-case-letter-sentences.js'
import {mergeInitialDigitSentences} from './plugin/merge-initial-digit-sentences.js'
import {mergePrefixExceptions} from './plugin/merge-prefix-exceptions.js'
import {mergeAffixExceptions} from './plugin/merge-affix-exceptions.js'
import {mergeRemainingFullStops} from './plugin/merge-remaining-full-stops.js'
import {makeInitialWhiteSpaceSiblings} from './plugin/make-initial-white-space-siblings.js'
import {makeFinalWhiteSpaceSiblings} from './plugin/make-final-white-space-siblings.js'
import {breakImplicitSentences} from './plugin/break-implicit-sentences.js'
import {removeEmptyNodes} from './plugin/remove-empty-nodes.js'
import {parserFactory} from './parser.js'
import {
newLine,
punctuation,
surrogates,
terminalMarker,
whiteSpace,
word
} from './expressions.js'
// PARSE LATIN
// Transform Latin-script natural language into an NLCST-tree.
export class ParseLatin {
constructor(doc, file) {
const value = file || doc
this.doc = value ? String(value) : null
}
// Run transform plugins for `key` on `nodes`.
run(key, nodes) {
const wareKey = key + 'Plugins'
const plugins = this[wareKey]
let index = -1
if (plugins) {
while (plugins[++index]) {
plugins[index](nodes)
}
}
return nodes
}
// Easy access to the document parser. This additionally supports retext-style
// invocation: where an instance is created for each file, and the file is given
// on construction.
parse(value) {
return this.tokenizeRoot(value || this.doc)
}
// Transform a `value` into a list of `NLCSTNode`s.
tokenize(value) {
const tokens = []
if (value === null || value === undefined) {
value = ''
} else if (value instanceof String) {
value = value.toString()
}
if (typeof value !== 'string') {
// Return the given nodes if this is either an empty array, or an array with
// a node as a first child.
if ('length' in value && (!value[0] || value[0].type)) {
return value
}
throw new Error(
"Illegal invocation: '" +
value +
"' is not a valid argument for 'ParseLatin'"
)
}
if (!value) {
return tokens
}
// Eat mechanism to use.
const eater = this.position ? eat : noPositionEat
let index = 0
let offset = 0
let line = 1
let column = 1
let previous = ''
let queue = ''
let left
let right
let character
while (index < value.length) {
character = value.charAt(index)
if (whiteSpace.test(character)) {
right = 'WhiteSpace'
} else if (punctuation.test(character)) {
right = 'Punctuation'
} else if (word.test(character)) {
right = 'Word'
} else {
right = 'Symbol'
}
tick.call(this)
previous = character
character = ''
left = right
right = null
index++
}
tick.call(this)
return tokens
// Check one character.
function tick() {
if (
left === right &&
(left === 'Word' ||
left === 'WhiteSpace' ||
character === previous ||
surrogates.test(character))
) {
queue += character
} else {
// Flush the previous queue.
if (queue) {
this['tokenize' + left](queue, eater)
}
queue = character
}
}
// Remove `subvalue` from `value`.
// Expects `subvalue` to be at the start of `value`, and applies no
// validation.
function eat(subvalue) {
const pos = position()
update(subvalue)
return apply
// Add the given arguments, add `position` to the returned node, and return
// the node.
function apply(...input) {
return pos(add(...input))
}
}
// Remove `subvalue` from `value`.
// Does not patch positional information.
function noPositionEat() {
return add
}
// Add mechanism.
function add(node, parent) {
if (parent) {
parent.children.push(node)
} else {
tokens.push(node)
}
return node
}
// Mark position and patch `node.position`.
function position() {
const before = now()
// Add the position to a node.
function patch(node) {
node.position = new Position(before)
return node
}
return patch
}
// Update line and column based on `value`.
function update(subvalue) {
let character = -1
let lastIndex = -1
offset += subvalue.length
while (++character < subvalue.length) {
if (subvalue.charAt(character) === '\n') {
lastIndex = character
line++
}
}
if (lastIndex < 0) {
column += subvalue.length
} else {
column = subvalue.length - lastIndex
}
}
// Store position information for a node.
function Position(start) {
this.start = start
this.end = now()
}
// Get the current position.
function now() {
return {line, column, offset}
}
}
}
// Default position.
ParseLatin.prototype.position = true
// Create text nodes.
ParseLatin.prototype.tokenizeSymbol = createTextFactory('Symbol')
ParseLatin.prototype.tokenizeWhiteSpace = createTextFactory('WhiteSpace')
ParseLatin.prototype.tokenizePunctuation = createTextFactory('Punctuation')
ParseLatin.prototype.tokenizeSource = createTextFactory('Source')
ParseLatin.prototype.tokenizeText = createTextFactory('Text')
// Inject `plugins` to modify the result of the method at `key` on the operated
// on context.
ParseLatin.prototype.use = useFactory(function (context, key, plugins) {
context[key] = context[key].concat(plugins)
})
// Inject `plugins` to modify the result of the method at `key` on the operated
// on context, before any other.
ParseLatin.prototype.useFirst = useFactory(function (context, key, plugins) {
context[key] = plugins.concat(context[key])
})
// PARENT NODES
//
// All these nodes are `pluggable`: they come with a `use` method which accepts
// a plugin (`function(NLCSTNode)`).
// Every time one of these methods is called, the plugin is invoked with the
// node, allowing for easy modification.
//
// In fact, the internal transformation from `tokenize` (a list of words, white
// space, punctuation, and symbols) to `tokenizeRoot` (an NLCST tree) is also
// implemented through this mechanism.
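// For instance (an illustrative sketch, not part of the upstream file), a
// plugin attached to `tokenizeWord` is invoked with every `WordNode` that the
// method produces, and may mutate it in place:
//
//   const parser = new ParseLatin()
//   parser.use('tokenizeWord', function (word) {
//     // `word` is the `WordNode` that was just created.
//     word.data = {custom: true}
//   })
//   parser.parse('Two words.')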
// Create a `WordNode` with its children set to a single `TextNode`, its value
// set to the given `value`.
pluggable(ParseLatin, 'tokenizeWord', function (value, eat) {
const add = (eat || noopEat)('')
const parent = {type: 'WordNode', children: []}
this.tokenizeText(value, eat, parent)
return add(parent)
})
// Create a `SentenceNode` with its children set to `Node`s, their values set
// to the tokenized given `value`.
//
// Unless plugins add new nodes, the sentence is populated by `WordNode`s,
// `SymbolNode`s, `PunctuationNode`s, and `WhiteSpaceNode`s.
pluggable(
ParseLatin,
'tokenizeSentence',
parserFactory({type: 'SentenceNode', tokenizer: 'tokenize'})
)
// Create a `ParagraphNode` with its children set to `Node`s, their values set
// to the tokenized given `value`.
//
// Unless plugins add new nodes, the paragraph is populated by `SentenceNode`s
// and `WhiteSpaceNode`s.
pluggable(
ParseLatin,
'tokenizeParagraph',
parserFactory({
type: 'ParagraphNode',
delimiter: terminalMarker,
delimiterType: 'PunctuationNode',
tokenizer: 'tokenizeSentence'
})
)
// Create a `RootNode` with its children set to `Node`s, their values set to the
// tokenized given `value`.
pluggable(
ParseLatin,
'tokenizeRoot',
parserFactory({
type: 'RootNode',
delimiter: newLine,
delimiterType: 'WhiteSpaceNode',
tokenizer: 'tokenizeParagraph'
})
)
// PLUGINS
ParseLatin.prototype.use('tokenizeSentence', [
mergeInitialWordSymbol,
mergeFinalWordSymbol,
mergeInnerWordSymbol,
mergeInnerWordSlash,
mergeInitialisms,
mergeWords,
patchPosition
])
ParseLatin.prototype.use('tokenizeParagraph', [
mergeNonWordSentences,
mergeAffixSymbol,
mergeInitialLowerCaseLetterSentences,
mergeInitialDigitSentences,
mergePrefixExceptions,
mergeAffixExceptions,
mergeRemainingFullStops,
makeInitialWhiteSpaceSiblings,
makeFinalWhiteSpaceSiblings,
breakImplicitSentences,
removeEmptyNodes,
patchPosition
])
ParseLatin.prototype.use('tokenizeRoot', [
makeInitialWhiteSpaceSiblings,
makeFinalWhiteSpaceSiblings,
removeEmptyNodes,
patchPosition
])
// TEXT NODES
// Factory to create a `Text`.
function createTextFactory(type) {
type += 'Node'
return createText
// Construct a `Text` from a bound `type`
function createText(value, eat, parent) {
if (value === null || value === undefined) {
value = ''
}
return (eat || noopEat)(value)({type, value: String(value)}, parent)
}
}
// Make a method “pluggable”.
function pluggable(Constructor, key, callback) {
// Set a pluggable version of `callback` on `Constructor`.
Constructor.prototype[key] = function (...input) {
return this.run(key, callback.apply(this, input))
}
}
// Factory to inject `plugins`. Takes `callback` for the actual inserting.
function useFactory(callback) {
return use
// Validate if `plugins` can be inserted.
// Invokes the bound `callback` to do the actual inserting.
function use(key, plugins) {
// Throw if the method is not pluggable.
if (!(key in this)) {
throw new Error(
'Illegal Invocation: Unsupported `key` for ' +
'`use(key, plugins)`. Make sure `key` is a ' +
'supported function'
)
}
// Fail silently when no plugins are given.
if (!plugins) {
return
}
const wareKey = key + 'Plugins'
// Make sure `plugins` is a list.
plugins = typeof plugins === 'function' ? [plugins] : plugins.concat()
// Make sure `wareKey` exists.
if (!this[wareKey]) {
this[wareKey] = []
}
// Invoke callback with the ware key and plugins.
callback(this, wareKey, plugins)
}
}
// Add mechanism used when text-tokenisers are called directly outside of the
// `tokenize` function.
function noopAdd(node, parent) {
if (parent) {
parent.children.push(node)
}
return node
}
// Eat and add mechanism without adding positional information, used when
// text-tokenisers are called directly outside of the `tokenize` function.
function noopEat() {
return noopAdd
}

18
node_modules/parse-latin/lib/parser.js generated vendored Normal file

@@ -0,0 +1,18 @@
import {tokenizerFactory} from './tokenizer.js'
// Construct a parser based on `options`.
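// For example, `lib/index.js` builds `tokenizeRoot` with
// `parserFactory({type: 'RootNode', delimiter: newLine, delimiterType:
// 'WhiteSpaceNode', tokenizer: 'tokenizeParagraph'})`: the resulting parser
// calls `this.tokenizeParagraph(value)` and splits its children at white
// space containing a new line, each group becoming a paragraph child of the
// returned `RootNode`.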
export function parserFactory(options) {
const type = options.type
const tokenizerProperty = options.tokenizer
const delimiter = options.delimiter
const tokenize =
delimiter && tokenizerFactory(options.delimiterType, delimiter)
return parser
function parser(value) {
const children = this[tokenizerProperty](value)
return {type, children: tokenize ? tokenize(children) : children}
}
}

51
node_modules/parse-latin/lib/plugin/break-implicit-sentences.js generated vendored Normal file

@@ -0,0 +1,51 @@
import {toString} from 'nlcst-to-string'
import {modifyChildren} from 'unist-util-modify-children'
// Break a sentence if a white space with more than one new-line is found.
export const breakImplicitSentences = modifyChildren(function (
child,
index,
parent
) {
if (child.type !== 'SentenceNode') {
return
}
const children = child.children
// Ignore first and last child.
let position = 0
while (++position < children.length - 1) {
const node = children[position]
if (
node.type !== 'WhiteSpaceNode' ||
toString(node).split(/\r\n|\r|\n/).length < 3
) {
continue
}
child.children = children.slice(0, position)
const insertion = {
type: 'SentenceNode',
children: children.slice(position + 1)
}
const tail = children[position - 1]
const head = children[position + 1]
parent.children.splice(index + 1, 0, node, insertion)
if (child.position && tail.position && head.position) {
const end = child.position.end
child.position.end = tail.position.end
insertion.position = {start: head.position.start, end}
}
return index + 1
}
})

27
node_modules/parse-latin/lib/plugin/make-final-white-space-siblings.js generated vendored Normal file

@@ -0,0 +1,27 @@
import {modifyChildren} from 'unist-util-modify-children'
// Move white space ending a paragraph up, so it becomes a sibling of the
// paragraph.
export const makeFinalWhiteSpaceSiblings = modifyChildren(function (
child,
index,
parent
) {
const children = child.children
if (
children &&
children.length > 0 &&
children[children.length - 1].type === 'WhiteSpaceNode'
) {
parent.children.splice(index + 1, 0, child.children.pop())
const previous = children[children.length - 1]
if (previous && previous.position && child.position) {
child.position.end = previous.position.end
}
// Next, iterate over the current node again.
return index
}
})

23
node_modules/parse-latin/lib/plugin/make-initial-white-space-siblings.js generated vendored Normal file

@@ -0,0 +1,23 @@
import {visitChildren} from 'unist-util-visit-children'
// Move white space starting a sentence up, so it becomes a sibling of the
// sentence.
export const makeInitialWhiteSpaceSiblings = visitChildren(function (
child,
index,
parent
) {
const children = child.children
if (
children &&
children.length > 0 &&
children[0].type === 'WhiteSpaceNode'
) {
parent.children.splice(index, 0, children.shift())
const next = children[0]
if (next && next.position && child.position) {
child.position.start = next.position.start
}
}
})

47
node_modules/parse-latin/lib/plugin/merge-affix-exceptions.js generated vendored Normal file

@@ -0,0 +1,47 @@
import {toString} from 'nlcst-to-string'
import {modifyChildren} from 'unist-util-modify-children'
// Merge a sentence into its previous sentence, when the sentence starts with a
// comma or a semicolon.
export const mergeAffixExceptions = modifyChildren(function (
child,
index,
parent
) {
const children = child.children
if (!children || children.length === 0 || index < 1) {
return
}
let position = -1
while (children[++position]) {
const node = children[position]
if (node.type === 'WordNode') {
return
}
if (node.type === 'SymbolNode' || node.type === 'PunctuationNode') {
const value = toString(node)
if (value !== ',' && value !== ';') {
return
}
const previousChild = parent.children[index - 1]
previousChild.children = previousChild.children.concat(children)
// Update position.
if (previousChild.position && child.position) {
previousChild.position.end = child.position.end
}
parent.children.splice(index, 1)
// Next, iterate over the node *now* at the current position.
return index
}
}
})

38
node_modules/parse-latin/lib/plugin/merge-affix-symbol.js generated vendored Normal file

@@ -0,0 +1,38 @@
import {toString} from 'nlcst-to-string'
import {modifyChildren} from 'unist-util-modify-children'
// Closing or final punctuation, or terminal markers that should still be
// included in the previous sentence, even though they follow the sentence's
// terminal marker.
import {affixSymbol} from '../expressions.js'
// Move certain punctuation following a terminal marker (thus in the next
// sentence) to the previous sentence.
export const mergeAffixSymbol = modifyChildren(function (child, index, parent) {
const children = child.children
if (children && children.length > 0 && index > 0) {
const first = children[0]
const second = children[1]
const previous = parent.children[index - 1]
if (
(first.type === 'SymbolNode' || first.type === 'PunctuationNode') &&
affixSymbol.test(toString(first))
) {
previous.children.push(children.shift())
// Update position.
if (first.position && previous.position) {
previous.position.end = first.position.end
}
if (second && second.position && child.position) {
child.position.start = second.position.start
}
// Next, iterate over the previous node again.
return index - 1
}
}
})

40
node_modules/parse-latin/lib/plugin/merge-final-word-symbol.js generated vendored Normal file

@@ -0,0 +1,40 @@
import {toString} from 'nlcst-to-string'
import {modifyChildren} from 'unist-util-modify-children'
// Merge certain punctuation marks into their preceding words.
export const mergeFinalWordSymbol = modifyChildren(function (
child,
index,
parent
) {
if (
index > 0 &&
(child.type === 'SymbolNode' || child.type === 'PunctuationNode') &&
toString(child) === '-'
) {
const children = parent.children
const previous = children[index - 1]
const next = children[index + 1]
if (
(!next || next.type !== 'WordNode') &&
previous &&
previous.type === 'WordNode'
) {
// Remove `child` from parent.
children.splice(index, 1)
// Add the punctuation mark at the end of the previous node.
previous.children.push(child)
// Update position.
if (previous.position && child.position) {
previous.position.end = child.position.end
}
// Next, iterate over the node *now* at the current position (which was
// the next node).
return index
}
}
})

34
node_modules/parse-latin/lib/plugin/merge-initial-digit-sentences.js generated vendored Normal file

@@ -0,0 +1,34 @@
import {toString} from 'nlcst-to-string'
import {modifyChildren} from 'unist-util-modify-children'
import {digitStart} from '../expressions.js'
// Merge a sentence into its previous sentence, when the sentence starts with a
// digit.
export const mergeInitialDigitSentences = modifyChildren(function (
child,
index,
parent
) {
const children = child.children
const siblings = parent.children
const previous = siblings[index - 1]
const head = children[0]
if (
previous &&
head &&
head.type === 'WordNode' &&
digitStart.test(toString(head))
) {
previous.children = previous.children.concat(children)
siblings.splice(index, 1)
// Update position.
if (previous.position && child.position) {
previous.position.end = child.position.end
}
// Next, iterate over the node *now* at the current position.
return index
}
})

48
node_modules/parse-latin/lib/plugin/merge-initial-lower-case-letter-sentences.js generated vendored Normal file

@@ -0,0 +1,48 @@
import {toString} from 'nlcst-to-string'
import {modifyChildren} from 'unist-util-modify-children'
// Initial lowercase letter.
import {lowerInitial} from '../expressions.js'
// Merge a sentence into its previous sentence, when the sentence starts with a
// lower case letter.
export const mergeInitialLowerCaseLetterSentences = modifyChildren(function (
child,
index,
parent
) {
const children = child.children
if (children && children.length > 0 && index > 0) {
let position = -1
while (children[++position]) {
const node = children[position]
if (node.type === 'WordNode') {
if (!lowerInitial.test(toString(node))) {
return
}
const siblings = parent.children
const previous = siblings[index - 1]
previous.children = previous.children.concat(children)
siblings.splice(index, 1)
// Update position.
if (previous.position && child.position) {
previous.position.end = child.position.end
}
// Next, iterate over the node *now* at the current position.
return index
}
if (node.type === 'SymbolNode' || node.type === 'PunctuationNode') {
return
}
}
}
})

42
node_modules/parse-latin/lib/plugin/merge-initial-word-symbol.js generated vendored Normal file

@@ -0,0 +1,42 @@
import {toString} from 'nlcst-to-string'
import {modifyChildren} from 'unist-util-modify-children'
// Merge certain punctuation marks into their following words.
export const mergeInitialWordSymbol = modifyChildren(function (
child,
index,
parent
) {
if (
(child.type !== 'SymbolNode' && child.type !== 'PunctuationNode') ||
toString(child) !== '&'
) {
return
}
const children = parent.children
const next = children[index + 1]
// If a previous word exists, or if there is no following word, exit early.
if (
(index > 0 && children[index - 1].type === 'WordNode') ||
!(next && next.type === 'WordNode')
) {
return
}
// Remove `child` from parent.
children.splice(index, 1)
// Add the punctuation mark at the start of the next node.
next.children.unshift(child)
// Update position.
if (next.position && child.position) {
next.position.start = child.position.start
}
// Next, iterate over the node at the previous position, as it's now adjacent
// to a following word.
return index - 1
})

62
node_modules/parse-latin/lib/plugin/merge-initialisms.js generated vendored Normal file

@@ -0,0 +1,62 @@
import {toString} from 'nlcst-to-string'
import {modifyChildren} from 'unist-util-modify-children'
import {numerical} from '../expressions.js'
// Merge a final full stop into a preceding word that looks like an initialism
// (single characters separated by full stops, such as `G.I.`), unless that
// word consists solely of digits.
export const mergeInitialisms = modifyChildren(function (child, index, parent) {
if (index > 0 && toString(child) === '.') {
const siblings = parent.children
const previous = siblings[index - 1]
const children = previous.children
if (
previous.type === 'WordNode' &&
children &&
children.length !== 1 &&
children.length % 2 !== 0
) {
let position = children.length
let isAllDigits = true
while (children[--position]) {
const otherChild = children[position]
const value = toString(otherChild)
if (position % 2 === 0) {
// Initialisms consist of one-character values.
if (value.length > 1) {
return
}
if (!numerical.test(value)) {
isAllDigits = false
}
} else if (value !== '.') {
if (position < children.length - 2) {
break
} else {
return
}
}
}
if (!isAllDigits) {
// Remove `child` from parent.
siblings.splice(index, 1)
// Add child to the previous children.
children.push(child)
// Update position.
if (previous.position && child.position) {
previous.position.end = child.position.end
}
// Next, iterate over the node *now* at the current position.
return index
}
}
}
})

50
node_modules/parse-latin/lib/plugin/merge-inner-word-slash.js generated vendored Normal file

@@ -0,0 +1,50 @@
import {toString} from 'nlcst-to-string'
import {modifyChildren} from 'unist-util-modify-children'
const slash = '/'
// Merge words joined by certain punctuation marks.
export const mergeInnerWordSlash = modifyChildren(function (
child,
index,
parent
) {
const siblings = parent.children
const previous = siblings[index - 1]
const next = siblings[index + 1]
if (
previous &&
previous.type === 'WordNode' &&
(child.type === 'SymbolNode' || child.type === 'PunctuationNode') &&
toString(child) === slash
) {
const previousValue = toString(previous)
let tail = child
let queue = [child]
let count = 1
let nextValue = ''
if (next && next.type === 'WordNode') {
nextValue = toString(next)
tail = next
queue = queue.concat(next.children)
count++
}
if (previousValue.length < 3 && (!nextValue || nextValue.length < 3)) {
// Add all found tokens to `previous`'s children.
previous.children = previous.children.concat(queue)
siblings.splice(index, count)
// Update position.
if (previous.position && tail.position) {
previous.position.end = tail.position.end
}
// Next, iterate over the node *now* at the current position.
return index
}
}
})

72
node_modules/parse-latin/lib/plugin/merge-inner-word-symbol.js generated vendored Normal file

@@ -0,0 +1,72 @@
import {toString} from 'nlcst-to-string'
import {modifyChildren} from 'unist-util-modify-children'
// Symbols part of surrounding words.
import {wordSymbolInner} from '../expressions.js'
// Merge words joined by certain punctuation marks.
export const mergeInnerWordSymbol = modifyChildren(function (
child,
index,
parent
) {
if (
index > 0 &&
(child.type === 'SymbolNode' || child.type === 'PunctuationNode')
) {
const siblings = parent.children
const previous = siblings[index - 1]
if (previous && previous.type === 'WordNode') {
let position = index - 1
let tokens = []
let queue = []
// - If a token which is neither word nor inner word symbol is found,
// the loop is broken
// - If an inner word symbol is found, it's queued
// - If a word is found, it's queued (and the queue stored and emptied)
while (siblings[++position]) {
const sibling = siblings[position]
if (sibling.type === 'WordNode') {
tokens = tokens.concat(queue, sibling.children)
queue = []
} else if (
(sibling.type === 'SymbolNode' ||
sibling.type === 'PunctuationNode') &&
wordSymbolInner.test(toString(sibling))
) {
queue.push(sibling)
} else {
break
}
}
if (tokens.length > 0) {
// If there is a queue, remove its length from `position`.
if (queue.length > 0) {
position -= queue.length
}
// Remove every (one or more) inner-word punctuation marks and children
// of words.
siblings.splice(index, position - index)
// Add all found tokens to `previous`'s children.
previous.children = previous.children.concat(tokens)
const last = tokens[tokens.length - 1]
// Update position.
if (previous.position && last.position) {
previous.position.end = last.position.end
}
// Next, iterate over the node *now* at the current position.
return index
}
}
}
})

50
node_modules/parse-latin/lib/plugin/merge-non-word-sentences.js generated vendored Normal file

@@ -0,0 +1,50 @@
import {modifyChildren} from 'unist-util-modify-children'
// Merge a sentence into the following sentence, when the sentence does not
// contain word tokens.
export const mergeNonWordSentences = modifyChildren(function (
child,
index,
parent
) {
const children = child.children
let position = -1
while (children[++position]) {
if (children[position].type === 'WordNode') {
return
}
}
const previous = parent.children[index - 1]
if (previous) {
previous.children = previous.children.concat(children)
// Remove the child.
parent.children.splice(index, 1)
// Patch position.
if (previous.position && child.position) {
previous.position.end = child.position.end
}
// Next, iterate over the node *now* at the current position (which was the
// next node).
return index
}
const next = parent.children[index + 1]
if (next) {
next.children = children.concat(next.children)
// Patch position.
if (next.position && child.position) {
next.position.start = child.position.start
}
// Remove the child.
parent.children.splice(index, 1)
}
})

72
node_modules/parse-latin/lib/plugin/merge-prefix-exceptions.js generated vendored Normal file

@@ -0,0 +1,72 @@
import {toString} from 'nlcst-to-string'
import {modifyChildren} from 'unist-util-modify-children'
// A case-insensitive expression for abbreviations after which a full stop
// should not be treated as a terminal sentence marker.
const abbreviationPrefix = new RegExp(
'^(' +
'[0-9]{1,3}|' +
'[a-z]|' +
// Common Latin Abbreviations:
// Based on: <https://en.wikipedia.org/wiki/List_of_Latin_abbreviations>.
// Where only the abbreviations written without joining full stops,
// but with a final full stop, were extracted.
//
// circa, capitulus, confer, compare, centum weight, eadem, (et) alii,
// et cetera, floruit, foliis, ibidem, idem, nemine && contradicente,
// opere && citato, (per) cent, (per) procurationem, (pro) tempore,
// sic erat scriptum, (et) sequentia, statim, videlicet.
'al|ca|cap|cca|cent|cf|cit|con|cp|cwt|ead|etc|ff|' +
'fl|ibid|id|nem|op|pro|seq|sic|stat|tem|viz' +
')$'
)
// Merge a sentence into its next sentence, when the sentence ends with a
// certain word.
export const mergePrefixExceptions = modifyChildren(function (
child,
index,
parent
) {
const children = child.children
if (children && children.length > 1) {
const period = children[children.length - 1]
if (period && toString(period) === '.') {
const node = children[children.length - 2]
if (
node &&
node.type === 'WordNode' &&
abbreviationPrefix.test(toString(node).toLowerCase())
) {
// Merge period into abbreviation.
node.children.push(period)
children.pop()
// Update position.
if (period.position && node.position) {
node.position.end = period.position.end
}
// Merge sentences.
const next = parent.children[index + 1]
if (next) {
child.children = children.concat(next.children)
parent.children.splice(index + 1, 1)
// Update position.
if (next.position && child.position) {
child.position.end = next.position.end
}
// Next, iterate over the current node again.
return index - 1
}
}
}
}
})

90
node_modules/parse-latin/lib/plugin/merge-remaining-full-stops.js generated vendored Normal file

@@ -0,0 +1,90 @@
import {toString} from 'nlcst-to-string'
import {visitChildren} from 'unist-util-visit-children'
// Sentence-terminal markers (such as full stops, question marks, and
// exclamation marks).
import {terminalMarker} from '../expressions.js'
// Merge non-terminal-marker full stops into the previous word (if available),
// or the next word (if available).
export const mergeRemainingFullStops = visitChildren(function (child) {
const children = child.children
let position = children.length
let hasFoundDelimiter = false
while (children[--position]) {
const grandchild = children[position]
if (
grandchild.type !== 'SymbolNode' &&
grandchild.type !== 'PunctuationNode'
) {
// This is a sentence without terminal marker, so we 'fool' the code to
// make it think we have found one.
if (grandchild.type === 'WordNode') {
hasFoundDelimiter = true
}
continue
}
// Exit when this token is not a terminal marker.
if (!terminalMarker.test(toString(grandchild))) {
continue
}
// Ignore the first terminal marker found (starting at the end), as it
// should not be merged.
if (!hasFoundDelimiter) {
hasFoundDelimiter = true
continue
}
// Only merge a single full stop.
if (toString(grandchild) !== '.') {
continue
}
const previous = children[position - 1]
const next = children[position + 1]
if (previous && previous.type === 'WordNode') {
const nextNext = children[position + 2]
// Continue when the full stop is followed by a space and another full
// stop, such as: `{.} .`
if (
next &&
nextNext &&
next.type === 'WhiteSpaceNode' &&
toString(nextNext) === '.'
) {
continue
}
// Remove `child` from parent.
children.splice(position, 1)
// Add the punctuation mark at the end of the previous node.
previous.children.push(grandchild)
// Update position.
if (grandchild.position && previous.position) {
previous.position.end = grandchild.position.end
}
position--
} else if (next && next.type === 'WordNode') {
// Remove `child` from parent.
children.splice(position, 1)
// Add the punctuation mark at the start of the next node.
next.children.unshift(grandchild)
if (grandchild.position && next.position) {
next.position.start = grandchild.position.start
}
}
}
})

28
node_modules/parse-latin/lib/plugin/merge-words.js generated vendored Normal file

@@ -0,0 +1,28 @@
import {modifyChildren} from 'unist-util-modify-children'
// Merge multiple words. This merges the children of adjacent words, something
// which should not occur naturally by parse-latin, but might happen when custom
// tokens were passed in.
export const mergeWords = modifyChildren(function (child, index, parent) {
const siblings = parent.children
if (child.type === 'WordNode') {
const next = siblings[index + 1]
if (next && next.type === 'WordNode') {
// Remove `next` from parent.
siblings.splice(index + 1, 1)
// Add the following word's children to the current word.
child.children = child.children.concat(next.children)
// Update position.
if (next.position && child.position) {
child.position.end = next.position.end
}
// Next, re-iterate the current node.
return index
}
}
})

31
node_modules/parse-latin/lib/plugin/patch-position.js generated vendored Normal file

@@ -0,0 +1,31 @@
import {visitChildren} from 'unist-util-visit-children'
// Patch the position on a parent node based on its first and last child.
export const patchPosition = visitChildren(function (child, index, node) {
const siblings = node.children
if (!child.position) {
return
}
if (
index < 1 &&
/* c8 ignore next */
(!node.position || !node.position.start)
) {
patch(node)
node.position.start = child.position.start
}
if (index === siblings.length - 1 && (!node.position || !node.position.end)) {
patch(node)
node.position.end = child.position.end
}
})
// Add a `position` object when it does not yet exist on `node`.
function patch(node) {
if (!node.position) {
node.position = {}
}
}

12
node_modules/parse-latin/lib/plugin/remove-empty-nodes.js generated vendored Normal file

@@ -0,0 +1,12 @@
import {modifyChildren} from 'unist-util-modify-children'
// Remove empty children.
export const removeEmptyNodes = modifyChildren(function (child, index, parent) {
if ('children' in child && child.children.length === 0) {
parent.children.splice(index, 1)
// Next, iterate over the node *now* at the current position (which was the
// next node).
return index
}
})

42
node_modules/parse-latin/lib/tokenizer.js generated vendored Normal file

@@ -0,0 +1,42 @@
import {toString} from 'nlcst-to-string'
// Factory to create a tokenizer based on a given `expression`.
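// For example, `tokenizerFactory('PunctuationNode', terminalMarker)` (the
// call made via `lib/parser.js` when building `tokenizeParagraph`) returns a
// function that takes a parent node and regroups its children into several
// nodes of the same type, starting a new group after every punctuation child
// whose value matches the terminal-marker expression.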
export function tokenizerFactory(childType, expression) {
return tokenizer
// A function that splits.
function tokenizer(node) {
const children = []
const tokens = node.children
const type = node.type
let index = -1
const lastIndex = tokens.length - 1
let start = 0
while (++index < tokens.length) {
if (
index === lastIndex ||
(tokens[index].type === childType &&
expression.test(toString(tokens[index])))
) {
const first = tokens[start]
const last = tokens[index]
const parent = {type, children: tokens.slice(start, index + 1)}
if (first.position && last.position) {
parent.position = {
start: first.position.start,
end: last.position.end
}
}
children.push(parent)
start = index + 1
}
}
return children
}
}

22
node_modules/parse-latin/license generated vendored Normal file

@@ -0,0 +1,22 @@
(The MIT License)
Copyright (c) 2014 Titus Wormer <tituswormer@gmail.com>
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
'Software'), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

80
node_modules/parse-latin/package.json generated vendored Normal file

@@ -0,0 +1,80 @@
{
"name": "parse-latin",
"version": "5.0.1",
"description": "Latin-script (natural language) parser",
"license": "MIT",
"keywords": [
"nlcst",
"latin",
"script",
"natural",
"language",
"parser"
],
"repository": "wooorm/parse-latin",
"bugs": "https://github.com/wooorm/parse-latin/issues",
"funding": {
"type": "github",
"url": "https://github.com/sponsors/wooorm"
},
"author": "Titus Wormer <tituswormer@gmail.com> (https://wooorm.com)",
"contributors": [
"Titus Wormer <tituswormer@gmail.com> (https://wooorm.com)"
],
"sideEffects": false,
"type": "module",
"main": "index.js",
"files": [
"lib/",
"index.js"
],
"dependencies": {
"nlcst-to-string": "^3.0.0",
"unist-util-modify-children": "^3.0.0",
"unist-util-visit-children": "^2.0.0"
},
"devDependencies": {
"@unicode/unicode-13.0.0": "^1.0.0",
"c8": "^7.0.0",
"is-hidden": "^2.0.0",
"negate": "^1.0.0",
"nlcst-test": "^3.0.0",
"nyc": "^15.0.0",
"prettier": "^2.0.0",
"regenerate": "^1.0.0",
"remark-cli": "^11.0.0",
"remark-preset-wooorm": "^9.0.0",
"unist-util-remove-position": "^4.0.0",
"vfile": "^5.0.0",
"xo": "^0.52.0"
},
"scripts": {
"prepack": "npm run generate && npm run format",
"fixture": "node script/generate-fixture.js",
"generate": "node script/build-expressions.js",
"format": "remark . -qfo && prettier . -w --loglevel warn && xo --fix",
"test-api": "node --conditions development test/index.js",
"test-coverage": "c8 --check-coverage --100 --reporter lcov npm run test-api",
"test": "npm run generate && npm run format && npm run test-coverage"
},
"prettier": {
"tabWidth": 2,
"useTabs": false,
"singleQuote": true,
"bracketSpacing": false,
"semi": false,
"trailingComma": "none"
},
"xo": {
"prettier": true,
"rules": {
"max-depth": "off",
"no-misleading-character-class": "off"
}
},
"remarkConfig": {
"plugins": [
"preset-wooorm"
]
}
}

150
node_modules/parse-latin/readme.md generated vendored Normal file

@@ -0,0 +1,150 @@
# parse-latin
[![Build][build-badge]][build]
[![Coverage][coverage-badge]][coverage]
[![Downloads][downloads-badge]][downloads]
[![Size][size-badge]][size]
[![Chat][chat-badge]][chat]
A Latin-script language parser for [**retext**][retext] producing **[nlcst][]**
nodes.
Whether Old-English (“þā gewearþ þǣm hlāforde and þǣm hȳrigmannum wiþ ānum
penninge”), Icelandic (“Hvað er að frétta”), or French (“Où sont les
toilettes?”), `parse-latin` does a good job at tokenizing it.
Note also that `parse-latin` does a decent job at tokenizing Latin-like scripts,
Cyrillic (“Добро пожаловать!”), Georgian (“როგორა ხარ?”), Armenian (“Շատ հաճելի
է”), and such.
## Install
This package is ESM only: Node 12+ is needed to use it and it must be `import`ed
instead of `require`d.
[npm][]:
```sh
npm install parse-latin
```
## Use
```js
import {inspect} from 'unist-util-inspect'
import {ParseLatin} from 'parse-latin'
const tree = new ParseLatin().parse('A simple sentence.')
console.log(inspect(tree))
```
Which, when inspecting, yields:
```txt
RootNode[1] (1:1-1:19, 0-18)
└─0 ParagraphNode[1] (1:1-1:19, 0-18)
└─0 SentenceNode[6] (1:1-1:19, 0-18)
├─0 WordNode[1] (1:1-1:2, 0-1)
│ └─0 TextNode "A" (1:1-1:2, 0-1)
├─1 WhiteSpaceNode " " (1:2-1:3, 1-2)
├─2 WordNode[1] (1:3-1:9, 2-8)
│ └─0 TextNode "simple" (1:3-1:9, 2-8)
├─3 WhiteSpaceNode " " (1:9-1:10, 8-9)
├─4 WordNode[1] (1:10-1:18, 9-17)
│ └─0 TextNode "sentence" (1:10-1:18, 9-17)
└─5 PunctuationNode "." (1:18-1:19, 17-18)
```
## API
This package exports the following identifiers: `ParseLatin`.
There is no default export.
### `ParseLatin(value)`
Exposes the functionality needed to tokenize natural Latin-script languages into
a syntax tree.
If `value` is passed here, it does not need to be given to `#parse()`.
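For example, both of the following produce the same tree:
```js
import {ParseLatin} from 'parse-latin'
// Give the document on construction:
const treeA = new ParseLatin('A simple sentence.').parse()
// Or pass it to `parse` directly:
const treeB = new ParseLatin().parse('A simple sentence.')
```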
#### `ParseLatin#tokenize(value)`
Tokenize `value` (`string`) into letters and numbers (words), white space, and
everything else (punctuation).
The returned nodes are a flat list without paragraphs or sentences.
###### Returns
[`Array.<Node>`][nlcst] — Nodes.
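For example, a small sketch of calling `tokenize` directly:
```js
import {ParseLatin} from 'parse-latin'
const tokens = new ParseLatin().tokenize('Hi, world!')
// A flat list of word, punctuation, and white space nodes, with no root,
// paragraph, or sentence wrappers.
console.log(tokens.map((d) => d.type))
```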
#### `ParseLatin#parse(value)`
Tokenize `value` (`string`) into an [NLCST][] tree.
The returned node is a `RootNode`, with paragraphs and sentences inside it.
###### Returns
[`Node`][nlcst] — Root node.
## Algorithm
> Note: The easiest way to see **how parse-latin tokenizes and parses** is by
> using the [online parser demo][demo], which
> shows the syntax tree corresponding to the typed text.
`parse-latin` splits text into white space, word, and punctuation tokens.
`parse-latin` starts out with a pretty easy definition, one that most other
tokenizers use:
* A “word” is one or more letter or number characters
* A “white space” is one or more white space characters
* A “punctuation” is one or more of anything else
Then, it manipulates and merges those tokens into a ([nlcst][]) syntax tree,
adding sentences and paragraphs where needed.
* Some punctuation marks are part of the word they occur in, such as
`non-profit`, `she's`, `G.I.`, `11:00`, `N/A`, `&c`, `nineteenth- and…`
* Some full-stops do not mark a sentence end, such as `1.`, `e.g.`, `id.`
* Although full-stops, question marks, and exclamation marks (sometimes) end a
sentence, that end might not occur directly after the mark, such as `.)`,
`."`
* And many more exceptions
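As a small sketch of those exceptions, an abbreviation such as `e.g.` does not
end the sentence it appears in:
```js
import {inspect} from 'unist-util-inspect'
import {ParseLatin} from 'parse-latin'
const tree = new ParseLatin().parse('See the demo, e.g. to inspect the tree.')
// The full stops in `e.g.` stay inside the sentence instead of terminating
// it, so the paragraph contains a single `SentenceNode`.
console.log(inspect(tree))
```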
## License
[MIT][license] © [Titus Wormer][author]
<!-- Definitions -->
[build-badge]: https://github.com/wooorm/parse-latin/workflows/main/badge.svg
[build]: https://github.com/wooorm/parse-latin/actions
[coverage-badge]: https://img.shields.io/codecov/c/github/wooorm/parse-latin.svg
[coverage]: https://codecov.io/github/wooorm/parse-latin
[downloads-badge]: https://img.shields.io/npm/dm/parse-latin.svg
[downloads]: https://www.npmjs.com/package/parse-latin
[size-badge]: https://img.shields.io/bundlephobia/minzip/parse-latin.svg
[size]: https://bundlephobia.com/result?p=parse-latin
[chat-badge]: https://img.shields.io/badge/join%20the%20community-on%20spectrum-7b16ff.svg
[chat]: https://spectrum.chat/unified/retext
[npm]: https://docs.npmjs.com/cli/install
[demo]: https://wooorm.com/parse-latin/
[license]: license
[author]: https://wooorm.com
[retext]: https://github.com/retextjs/retext
[nlcst]: https://github.com/syntax-tree/nlcst