You can not select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
					
					
						
							346 lines
						
					
					
						
							9.9 KiB
						
					
					
				
			
		
		
	
	
							346 lines
						
					
					
						
							9.9 KiB
						
					
					
				'use strict'; | 
						|
 | 
						|
const generate = require('regjsgen').generate; | 
						|
const parse = require('regjsparser').parse; | 
						|
const regenerate = require('regenerate'); | 
						|
const unicodeMatchProperty = require('unicode-match-property-ecmascript'); | 
						|
const unicodeMatchPropertyValue = require('unicode-match-property-value-ecmascript'); | 
						|
const iuMappings = require('./data/iu-mappings.js'); | 
						|
const ESCAPE_SETS = require('./data/character-class-escape-sets.js'); | 
						|
 | 
						|
// Every Unicode code point, U+0000 through U+10FFFF. Negated property escapes
// are computed by removing a set from a clone of this one.
const UNICODE_SET = regenerate().addRange(0x0, 0x10FFFF);

// The Basic Multilingual Plane only: without the `u` flag, the range stops at
// 0xFFFF. https://mths.be/es6#sec-pattern-semantics
const BMP_SET = regenerate().addRange(0x0, 0xFFFF);

// The code points that `/./u` is supposed to match
// (https://mths.be/es6#sec-atom): all of Unicode minus the four
// `LineTerminator`s (https://mths.be/es6#sec-line-terminators).
const DOT_SET_UNICODE = UNICODE_SET.clone()
	.remove(0x000A) // Line Feed <LF>
	.remove(0x000D) // Carriage Return <CR>
	.remove(0x2028) // Line Separator <LS>
	.remove(0x2029); // Paragraph Separator <PS>
						|
 | 
						|
// Look up the precomputed set for a character class escape.
//
// `character` is the key passed to the table's `get()`; callers in this file
// pass the escape's `value`. The table is chosen from `ESCAPE_SETS`:
//   - `REGULAR` when `unicode` is falsy (`ignoreCase` is not consulted),
//   - `UNICODE_IGNORE_CASE` when both `unicode` and `ignoreCase` are truthy,
//   - `UNICODE` otherwise.
// Returns whatever the table holds for that key.
const getCharacterClassEscapeSet = (character, unicode, ignoreCase) => {
	const table = !unicode ?
		ESCAPE_SETS.REGULAR :
		ignoreCase ?
			ESCAPE_SETS.UNICODE_IGNORE_CASE :
			ESCAPE_SETS.UNICODE;
	return table.get(character);
};
						|
 | 
						|
// Pick the set that `.` stands for in Unicode mode: every code point when
// `dotAll` is truthy, otherwise every code point except line terminators.
// The shared module-level set is returned as-is (not cloned).
const getUnicodeDotSet = (dotAll) => {
	if (dotAll) {
		return UNICODE_SET;
	}
	return DOT_SET_UNICODE;
};
						|
 | 
						|
// Load the code point set for a Unicode property from the
// `regenerate-unicode-properties` package.
//
// With a `value`, the module `<property>/<value>.js` is loaded. Without one,
// `property` is treated as a binary property and `Binary_Property/<property>.js`
// is loaded instead.
//
// Returns whatever that module exports. Throws an `Error` if the module cannot
// be loaded; the underlying exception is kept on the error's `cause` property
// so the real reason is not lost.
const getUnicodePropertyValueSet = (property, value) => {
	const path = value ?
		`${ property }/${ value }` :
		`Binary_Property/${ property }`;
	try {
		return require(`regenerate-unicode-properties/${ path }.js`);
	} catch (exception) {
		// Without a `value` the lookup was for a binary property, so do not
		// report a bogus "value `undefined`" in that case.
		const message = value ?
			`Failed to recognize value \`${ value }\` for property ` +
			`\`${ property }\`.` :
			`Failed to recognize binary property \`${ property }\`.`;
		const error = new Error(message);
		error.cause = exception;
		throw error;
	}
};
						|
 | 
						|
// Resolve a property escape that has no `=` in it.
//
// Such a name is first tried as a `General_Category` value; if anything in
// that attempt throws, it is retried as a binary property name. An exception
// from the second attempt propagates to the caller.
const handleLoneUnicodePropertyNameOrValue = (value) => {
	try {
		// `unicodeMatchPropertyValue` throws on invalid values, which is how
		// "not a General_Category value" is detected.
		return getUnicodePropertyValueSet(
			'General_Category',
			unicodeMatchPropertyValue('General_Category', value)
		);
	} catch (exception) {
		// Deliberately ignored: fall back to the binary property lookup below.
	}
	// `unicodeMatchProperty` throws on invalid properties.
	return getUnicodePropertyValueSet(unicodeMatchProperty(value));
};
						|
 | 
						|
// Build the set for a Unicode property escape.
//
// `value` is the text of the escape: either a lone name, or a
// `Property=Value` pair. `isNegative` selects the complement.
//
// Returns a fresh set in both cases: a clone of the property's set, or a
// clone of `UNICODE_SET` with the property's set removed.
const getUnicodePropertyEscapeSet = (value, isNegative) => {
	const segments = value.split('=');
	let matched;
	if (segments.length === 1) {
		matched = handleLoneUnicodePropertyNameOrValue(segments[0]);
	} else {
		// The pattern consists of two parts, i.e. `Property=Value`. Both are
		// canonicalized before the set is looked up.
		const canonicalProperty = unicodeMatchProperty(segments[0]);
		const canonicalValue = unicodeMatchPropertyValue(
			canonicalProperty,
			segments[1]
		);
		matched = getUnicodePropertyValueSet(canonicalProperty, canonicalValue);
	}
	return isNegative ?
		UNICODE_SET.clone().remove(matched) :
		matched.clone();
};
						|
 | 
						|
// Walk the code points from `min` up to `max` and add the case-folded
// counterpart of each one (when `caseFold` reports one) to this set.
// `min` itself is always examined, even when it is greater than `max`.
// Returns the set so calls can be chained.
regenerate.prototype.iuAddRange = function(min, max) {
	let codePoint = min;
	do {
		const counterpart = caseFold(codePoint);
		if (counterpart) {
			this.add(counterpart);
		}
	} while (++codePoint <= max);
	return this;
};
						|
 | 
						|
// Replace the contents of the AST node `item` with the parse tree of
// `pattern`, in place.
//
// `pattern` is parsed with the `u` flag only when `config.useUnicodeFlag` is
// set. A tree whose root is a character class, a group or a single value is
// copied as-is; any other root is first wrapped in a non-capturing group.
// The copy is done with `Object.assign`, so properties of `item` that the new
// tree does not define are left in place.
const update = (item, pattern) => {
	const parsed = parse(pattern, config.useUnicodeFlag ? 'u' : '');
	const isSelfContained =
		parsed.type === 'characterClass' ||
		parsed.type === 'group' ||
		parsed.type === 'value';
	Object.assign(item, isSelfContained ? parsed : wrap(parsed, pattern));
};
						|
 | 
						|
// Build a non-capturing group node (`behavior: 'ignore'`) whose only child is
// `tree` and whose raw source is `pattern` surrounded by `(?:` and `)`.
const wrap = (tree, pattern) => ({
	type: 'group',
	behavior: 'ignore',
	body: [tree],
	raw: `(?:${ pattern })`
});
						|
 | 
						|
// Look `codePoint` up in `iuMappings`. Returns the mapped entry when it is
// truthy and `false` otherwise (including when there is no entry at all).
const caseFold = (codePoint) => {
	const mapped = iuMappings.get(codePoint);
	return mapped ? mapped : false;
};
						|
 | 
						|
// Rewrite a character class node in place.
//
// Every member of `characterClassItem.body` is folded into one set, the set
// is serialized with `regenerateOptions`, and the node is overwritten (via
// `update`) with the parse tree of the result. For a negated class the
// serialized set is placed in a negative lookahead followed by `[\s\S]`.
//
// Returns the same `characterClassItem` object. Throws on a member whose
// `type` is not one of the four handled below.
const processCharacterClass = (characterClassItem, regenerateOptions) => {
	// Case-folded counterparts are only added by hand when the output will not
	// carry a `u` flag of its own.
	const addCaseFoldedMembers =
		config.ignoreCase && config.unicode && !config.useUnicodeFlag;
	const members = regenerate();
	for (const member of characterClassItem.body) {
		switch (member.type) {
			case 'value': {
				members.add(member.codePoint);
				if (addCaseFoldedMembers) {
					const counterpart = caseFold(member.codePoint);
					if (counterpart) {
						members.add(counterpart);
					}
				}
				break;
			}
			case 'characterClassRange': {
				const from = member.min.codePoint;
				const to = member.max.codePoint;
				members.addRange(from, to);
				if (addCaseFoldedMembers) {
					members.iuAddRange(from, to);
				}
				break;
			}
			case 'characterClassEscape': {
				const escapeSet = getCharacterClassEscapeSet(
					member.value,
					config.unicode,
					config.ignoreCase
				);
				members.add(escapeSet);
				break;
			}
			case 'unicodePropertyEscape': {
				const propertySet = getUnicodePropertyEscapeSet(
					member.value,
					member.negative
				);
				members.add(propertySet);
				break;
			}
			// The `default` clause is only here as a safeguard; it should never be
			// reached. Code coverage tools should ignore it.
			/* istanbul ignore next */
			default:
				throw new Error(`Unknown term type: ${ member.type }`);
		}
	}
	const source = members.toString(regenerateOptions);
	update(
		characterClassItem,
		characterClassItem.negative ? `(?!${ source })[\\s\\S]` : source
	);
	return characterClassItem;
};
						|
 | 
						|
// Turn a named back-reference node into a numbered one, in place: the `name`
// property is removed and `matchIndex` is set to `index`.
const updateNamedReference = (item, index) => {
	delete item.name;
	Object.assign(item, { matchIndex: index });
};
						|
 | 
						|
// Throw if `groups.unmatchedReferences` still has any keys, i.e. if some
// named back-reference was never paired with a group of the same name.
// The error message lists the offending names, comma-separated.
const assertNoUnmatchedReferences = (groups) => {
	const pendingNames = Object.keys(groups.unmatchedReferences);
	if (pendingNames.length === 0) {
		return;
	}
	throw new Error(`Unknown group names: ${pendingNames}`);
};
						|
 | 
						|
// Recursively rewrite one AST node according to the module-level `config`.
//
// Parameters:
//   item              - the node to process; it is mutated in place.
//   regenerateOptions - options handed to every `toString()` call on a set.
//   groups            - bookkeeping shared across the whole pattern:
//                       `lastIndex` (number of capturing groups seen so far),
//                       `names` (group name -> index), `unmatchedReferences`
//                       (name -> reference nodes seen before their group) and
//                       an optional `onNamedGroup(name, index)` callback.
//
// Returns the processed node. Throws on a duplicate group name and on a node
// whose `type` is not handled below.
const processTerm = (item, regenerateOptions, groups) => {
	switch (item.type) {
		case 'dot':
			if (config.useDotAllFlag) {
				// The `.` is left untouched in this mode.
				break;
			} else if (config.unicode) {
				update(
					item,
					getUnicodeDotSet(config.dotAll).toString(regenerateOptions)
				);
			} else if (config.dotAll) {
				// TODO: consider changing this at the regenerate level.
				update(item, '[\\s\\S]');
			}
			break;
		case 'characterClass':
			item = processCharacterClass(item, regenerateOptions);
			break;
		case 'unicodePropertyEscape':
			if (config.unicodePropertyEscape) {
				update(
					item,
					getUnicodePropertyEscapeSet(item.value, item.negative)
						.toString(regenerateOptions)
				);
			}
			break;
		case 'characterClassEscape':
			update(
				item,
				getCharacterClassEscapeSet(
					item.value,
					config.unicode,
					config.ignoreCase
				).toString(regenerateOptions)
			);
			break;
		case 'group':
			if (item.behavior == 'normal') {
				groups.lastIndex++;
			}
			if (item.name && config.namedGroup) {
				const name = item.name.value;

				if (groups.names[name]) {
					throw new Error(
						`Multiple groups with the same name (${ name }) are not allowed.`
					);
				}

				const index = groups.lastIndex;
				// Dropping the name turns this into a plain numbered group.
				delete item.name;

				groups.names[name] = index;
				if (groups.onNamedGroup) {
					groups.onNamedGroup.call(null, name, index);
				}

				// Resolve references that appeared before this group.
				if (groups.unmatchedReferences[name]) {
					groups.unmatchedReferences[name].forEach(reference => {
						updateNamedReference(reference, index);
					});
					delete groups.unmatchedReferences[name];
				}
			}
			/* falls through */
		case 'alternative':
		case 'disjunction':
		case 'quantifier':
			item.body = item.body.map(term => {
				return processTerm(term, regenerateOptions, groups);
			});
			break;
		case 'value': {
			const codePoint = item.codePoint;
			const set = regenerate(codePoint);
			if (config.ignoreCase && config.unicode && !config.useUnicodeFlag) {
				const folded = caseFold(codePoint);
				if (folded) {
					set.add(folded);
				}
			}
			update(item, set.toString(regenerateOptions));
			break;
		}
		case 'reference':
			// Named references are only rewritten when named groups are being
			// rewritten too. Otherwise no group name is ever registered in
			// `groups.names`, and recording the reference here would leave it
			// permanently "unmatched" even though the pattern is valid.
			if (item.name && config.namedGroup) {
				const name = item.name.value;
				const index = groups.names[name];
				if (index) {
					updateNamedReference(item, index);
					break;
				}

				if (!groups.unmatchedReferences[name]) {
					groups.unmatchedReferences[name] = [];
				}
				// Keep track of references used before the corresponding group.
				groups.unmatchedReferences[name].push(item);
			}
			break;
		case 'anchor':
		case 'empty':
			// Nothing to do here.
			break;
		// The `default` clause is only here as a safeguard; it should never be
		// reached. Code coverage tools should ignore it.
		/* istanbul ignore next */
		default:
			throw new Error(`Unknown term type: ${ item.type }`);
	}
	return item;
};
						|
 | 
						|
// Module-level settings read by the helpers in this file while a pattern is
// being rewritten. Every field starts out `false`; `rewritePattern` overwrites
// all of them at the start of each call.
const config = {
	ignoreCase: false,
	unicode: false,
	dotAll: false,
	useDotAllFlag: false,
	useUnicodeFlag: false,
	unicodePropertyEscape: false,
	namedGroup: false
};
						|
// Rewrite the source of a regular expression.
//
// Parameters:
//   pattern - the regular expression source text.
//   flags   - the flags string; `u`, `i` and `s` are inspected here. May be
//             omitted.
//   options - optional object; the properties read are `dotAllFlag`,
//             `namedGroup`, `useDotAllFlag`, `useUnicodeFlag`,
//             `unicodePropertyEscape`, `lookbehind` and `onNamedGroup`.
//
// Returns the pattern regenerated from the processed tree.
// Throws when `dotAllFlag` and `useDotAllFlag` are both set, and whenever
// parsing or processing the pattern throws.
//
// Note that this overwrites the shared module-level `config` on every call.
const rewritePattern = (pattern, flags, options) => {
	const hasFlag = (flag) => flags && flags.includes(flag);
	const option = (name) => options && options[name];

	const supportDotAllFlag = option('dotAllFlag');
	config.unicode = hasFlag('u');
	config.ignoreCase = hasFlag('i');
	// The `s` flag only counts when the caller asked for it to be handled.
	config.dotAll = supportDotAllFlag && hasFlag('s');
	config.namedGroup = option('namedGroup');
	config.useDotAllFlag = option('useDotAllFlag');
	config.useUnicodeFlag = option('useUnicodeFlag');
	config.unicodePropertyEscape = option('unicodePropertyEscape');

	if (supportDotAllFlag && config.useDotAllFlag) {
		throw new Error('`useDotAllFlag` and `dotAllFlag` cannot both be true!');
	}

	const regjsparserFeatures = {
		unicodePropertyEscape: config.unicode,
		namedGroups: true,
		lookbehind: option('lookbehind')
	};
	const regenerateOptions = {
		hasUnicodeFlag: config.useUnicodeFlag,
		bmpOnly: !config.unicode
	};
	const groups = {
		onNamedGroup: option('onNamedGroup'),
		lastIndex: 0,
		names: Object.create(null), // { [name]: index }
		unmatchedReferences: Object.create(null) // { [name]: Array<reference> }
	};

	const tree = parse(pattern, flags, regjsparserFeatures);
	// Note: `processTerm` mutates `tree` and `groups`.
	processTerm(tree, regenerateOptions, groups);
	assertNoUnmatchedReferences(groups);
	return generate(tree);
};
						|
 | 
						|
module.exports = rewritePattern;
 | 
						|
 |