Skip to content

Instantly share code, notes, and snippets.

@hansputera
Created March 17, 2025 18:03
Show Gist options
  • Save hansputera/d342f6b56881146fb81030e40a670d8e to your computer and use it in GitHub Desktop.
Save hansputera/d342f6b56881146fb81030e40a670d8e to your computer and use it in GitHub Desktop.
Simple asf HTML Parser (just an unopinionated parser), what I do?
import * as htmlparser2 from 'htmlparser2';
import type {
ParseChildNode,
ParserExtractFunc,
ParserFunctions,
ParserOptions,
} from '@/types/parser.js';
/** Registration record kept for every named function. */
type RegisteredFunction = {
	options: ParserOptions;
	funcs: ParserFunctions;
};

/**
 * A parser callback captured during the synchronous parse so that it can be
 * replayed later, in document order, with (possibly async) user functions.
 */
type ParseEvent =
	| { kind: 'open'; name: string; attribs: Record<string, string> }
	| { kind: 'text'; data: string }
	| { kind: 'close' };

/** An active match plus the element depth at which its tag was opened. */
type ActiveNode = ParseChildNode & { depth: number; prev?: ActiveNode };

/** Own-property check that ignores inherited keys such as `toString`. */
const hasOwnKey = (target: object, key: PropertyKey): boolean =>
	Object.prototype.hasOwnProperty.call(target, key);

/**
 * Runs a set of named "detect + extract" functions over an HTML document.
 *
 * Every registered function contributes one key of the result object `T`:
 * `detect` selects the tags the function is interested in, `extract` converts
 * the text found inside such a tag into a value, and the function's options
 * control attribute picking and whether values are collected into an array.
 *
 * @class Parser
 */
export class Parser<T extends object> {
	protected functions: Record<keyof T, RegisteredFunction> = {} as Record<
		keyof T,
		RegisteredFunction
	>;

	/**
	 * @constructor
	 * @param options Parser options forwarded to the underlying htmlparser2 parser
	 */
	constructor(protected readonly options: htmlparser2.Options) {}

	/**
	 * Registers a function under `name`.
	 *
	 * @param name Result key the function writes to
	 * @param funcs The `detect` / `extract` pair
	 * @param options Collection and attribute-picking options
	 * @throws Error if a function with the same name is already registered
	 */
	public addFunction(name: keyof T, funcs: ParserFunctions, options: ParserOptions): void {
		// Own-property check: inherited keys (e.g. "toString") must not count as registered.
		if (hasOwnKey(this.functions, name)) {
			throw new Error(`Function ${String(name)} already exists`);
		}

		this.functions[name] = { funcs, options };
	}

	/**
	 * Unregisters the function stored under `name`.
	 *
	 * @param name Result key of the function to remove
	 * @throws Error if no function with that name is registered
	 */
	public removeFunction(name: keyof T): void {
		if (!hasOwnKey(this.functions, name)) {
			throw new Error(`Function ${String(name)} does not exist`);
		}

		Reflect.deleteProperty(this.functions, name);
	}

	/**
	 * Parses `html` and builds the result object from the registered functions.
	 *
	 * The document is tokenized first and the captured events are then replayed
	 * one at a time, awaiting every `detect` / `extract` call. The underlying
	 * parser invokes its callbacks synchronously and does not wait for promises,
	 * so awaiting inside the callbacks themselves would process events out of order.
	 *
	 * Text is handed to the innermost matched element that is still open. When a
	 * function's `maxCollectionCount` is greater than 1 its values are gathered
	 * in an array holding at most that many items; otherwise the latest value wins.
	 *
	 * @param html HTML source to parse
	 * @returns The collected results, keyed by function name
	 * @throws Error if no function is registered; also rejects when the parser
	 * reports an error or a user function throws
	 */
	public async execute(html: string): Promise<T> {
		const entries = Object.entries<RegisteredFunction>(this.functions);
		if (!entries.length) {
			throw new Error('No functions added');
		}

		// Snapshot of the registrations so a run is not affected by later add/remove calls.
		const registry = new Map(entries);
		const results = {} as T;

		let node: ActiveNode | undefined; // innermost match that is still open
		let depth = 0; // number of currently open elements

		for (const event of await this.collectEvents(html)) {
			if (event.kind === 'open') {
				depth += 1;
				for (const [funcName, registered] of entries) {
					if (!(await registered.funcs.detect(event.name, event.attribs))) {
						continue;
					}

					node = {
						currentTag: event.name,
						currentFunc: funcName,
						currentCount: 0,
						prev: node,
						depth,
					};
					this.pickAttributes(
						results,
						funcName,
						registered.options.pickAttributesAsValues,
						event.attribs,
					);
				}
			} else if (event.kind === 'text') {
				const registered = node ? registry.get(node.currentFunc) : undefined;
				if (node && registered) {
					const value: unknown = await registered.funcs.extract(event.data);
					this.storeValue(
						results,
						node.currentFunc,
						value,
						registered.options.maxCollectionCount,
					);
				}
			} else {
				// Leave every match that was opened by the element being closed. Matching
				// on depth (not tag name) keeps nested same-name elements from closing
				// an outer match early.
				while (node && node.depth === depth) {
					node = node.prev;
					if (node) {
						node.currentCount += 1; // one more matched descendant completed
					}
				}
				depth = Math.max(0, depth - 1);
			}
		}

		return results;
	}

	/** Tokenizes `html` and records open/text/close events in document order. */
	private collectEvents(html: string): Promise<ParseEvent[]> {
		return new Promise<ParseEvent[]>((resolve, reject) => {
			const events: ParseEvent[] = [];
			const parser = new htmlparser2.Parser(
				{
					onopentag: (name: string, attribs: Record<string, string>): void => {
						events.push({ kind: 'open', name, attribs });
					},
					ontext: (data: string): void => {
						events.push({ kind: 'text', data });
					},
					onclosetag: (): void => {
						events.push({ kind: 'close' });
					},
					onerror: (error: Error): void => {
						reject(error);
					},
					onend: (): void => {
						resolve(events);
					},
				},
				// Parser options are the second constructor argument, not handler members.
				this.options,
			);

			parser.write(html);
			parser.end();
		});
	}

	/** Copies the configured attributes of a detected tag into `results[key]`. */
	private pickAttributes(
		results: T,
		key: string,
		pairs: [string, string][],
		attribs: Record<string, string>,
	): void {
		if (!pairs.length) {
			return;
		}

		const existing: unknown = hasOwnKey(results, key) ? Reflect.get(results, key) : undefined;
		// Reuse an existing object so attributes from several tags accumulate;
		// anything else (missing or primitive) is replaced by a fresh object.
		const picked: Record<string, string | undefined> =
			typeof existing === 'object' && existing !== null
				? (existing as Record<string, string | undefined>)
				: {};

		for (const [attribName, valueName] of pairs) {
			picked[valueName] = attribs[attribName];
		}
		Reflect.set(results, key, picked);
	}

	/** Stores one extracted value, honouring the function's collection limit. */
	private storeValue(results: T, key: string, value: unknown, maxCollectionCount: number): void {
		if (!(maxCollectionCount > 1)) {
			Reflect.set(results, key, value); // single-value mode: latest value wins
			return;
		}

		if (!hasOwnKey(results, key)) {
			Reflect.set(results, key, [value]);
			return;
		}

		const existing: unknown = Reflect.get(results, key);
		if (!Array.isArray(existing)) {
			// A non-array value (e.g. picked attributes) becomes the first collected item.
			Reflect.set(results, key, [existing, value]);
			return;
		}

		if (existing.length < maxCollectionCount) {
			existing.push(value);
		}
	}
}
/**
 * Identity helper for declaring an extract function.
 *
 * It returns the function it is given unchanged; its only purpose is to let
 * the compiler check the function against `ParserExtractFunc<D>` at the
 * definition site.
 *
 * @param func The extract function to declare
 * @returns The very same function
 */
export const makeParserExtractFunc = <D>(
	func: ParserExtractFunc<D>,
): ParserExtractFunc<D> => {
	return func;
};
@hansputera
Copy link
Author

Here are the types:

export type ParserOptions = {
	/**
	 * @description Maximum number of items to collect for this function.
	 * `Parser.execute` only gathers results into an array when this is
	 * greater than 1; otherwise a single value is kept.
	 */
	maxCollectionCount: number;

	/**
	 * @description Attributes of a detected tag to copy into the result, given
	 * as `[attributeName, resultKey]` pairs. An empty array picks nothing.
	 * @example [["class", "cls"], ["href", "url"]] -> {"cls": "...", "url": "..."}
	 */
	pickAttributesAsValues: [string, string][];
};

/**
 * Decides whether a tag is handled by a function.
 *
 * @param tagName Name of the opened tag
 * @param tagAttribs Attributes of the opened tag
 * @returns `true` (or a promise of `true`) when the tag should be handled
 */
export type ParserDetectFunc = (
	tagName: string,
	tagAttribs: Record<string, string>,
) => Promise<boolean> | boolean;

/**
 * Converts the text found inside a detected tag into a value of type `D`.
 *
 * `D` defaults to `unknown` so the alias can be referenced without a type
 * argument (as `ParserFunctions` does) when the value type is not known.
 */
export type ParserExtractFunc<D = unknown> = (text: string) => Promise<D> | D;

/** The detect/extract pair registered for one result key. */
export type ParserFunctions = {
	detect: ParserDetectFunc;
	extract: ParserExtractFunc;
};

/**
 * Bookkeeping record used by `Parser.execute` to remember which detected tag
 * is currently being processed.
 */
export type ParseChildNode = {
	/** Name of the detected tag; `Parser.execute` starts with an empty string (nothing detected). */
	currentTag: string;
	/** Name of the registered function whose `detect` accepted the tag. */
	currentFunc: string;
	/** Counter that `Parser.execute` increments when a matched tag is closed. */
	currentCount: number;
	/** Node that was active before this one, if any; restored when the tag closes. */
	prev?: ParseChildNode;
};

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment