Skip to content

Instantly share code, notes, and snippets.

@hexianga
Last active June 10, 2023 08:48
Show Gist options
  • Save hexianga/9f1ce67fd2325a628e027f2fee15b624 to your computer and use it in GitHub Desktop.
Save hexianga/9f1ce67fd2325a628e027f2fee15b624 to your computer and use it in GitHub Desktop.
import axios from 'axios';
import * as cheerio from 'cheerio';
import * as iconvLite from 'iconv-lite';
/**
 * Downloads a web page and returns it parsed by cheerio, decoding the raw
 * bytes with the character set the page declares.
 *
 * The charset is looked up in this order:
 *   1. the `charset` parameter of the HTTP `Content-Type` response header;
 *   2. a `<meta charset="...">` tag;
 *   3. a `<meta http-equiv="Content-Type" content="...">` tag.
 * When none of them yields an encoding that iconv-lite recognises, the page
 * is decoded as UTF-8.
 *
 * @param webUrl Absolute URL of the page to download.
 * @returns The cheerio handle for the parsed page, or `undefined` when the
 *          request or the parsing fails (the error is logged).
 */
export const fetchHtml = async (webUrl: string) => {
  try {
    // Ask for raw bytes so that decoding can be done with the right charset.
    const { data: htmlBuffer, headers } = await axios.get(webUrl, {
      responseType: 'arraybuffer',
      headers: {
        'User-Agent':
          'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
      },
    });

    const rawBytes = Buffer.from(htmlBuffer);
    // The third `load` argument (`false`) is cheerio's `isDocument` flag.
    const parseAs = (encoding: string) =>
      cheerio.load(iconvLite.decode(rawBytes, encoding), null, false);

    // An explicit, recognised charset in the response header wins outright.
    // An unrecognised label is ignored instead of letting the decoder throw.
    const headerCharset = getCharset(headers['content-type']);
    if (headerCharset && iconvLite.encodingExists(headerCharset)) {
      return parseAs(headerCharset);
    }

    // Otherwise decode as UTF-8 first: the <meta> tags that declare the
    // charset are plain ASCII, so they can be read from this first pass.
    const $ = parseAs('utf-8');
    const metaCharset = (
      $('meta[charset]').attr('charset') ||
      getCharset($('meta[http-equiv="Content-Type"]').attr('content'))
    )
      ?.trim()
      .toLowerCase();

    // Only pay for a second decode when the page declares a different,
    // supported encoding.
    if (
      metaCharset &&
      metaCharset !== 'utf8' &&
      metaCharset !== 'utf-8' &&
      iconvLite.encodingExists(metaCharset)
    ) {
      return parseAs(metaCharset);
    }
    return $;
  } catch (error) {
    // Best effort: report the failure and let the caller handle `undefined`.
    console.log(error);
    return;
  }
};
/**
 * Extracts the value of the `charset` parameter from a Content-Type style
 * string such as `text/html; charset=utf-8`.
 *
 * Only the charset token itself is returned: surrounding quotes are dropped
 * and any parameters that follow it (e.g. `; boundary=...`) are ignored.
 *
 * @param str Header or `<meta content>` value to inspect; may be missing.
 * @returns The charset label exactly as written (case preserved), or
 *          `undefined` when `str` is empty or carries no charset parameter.
 */
const getCharset = (str?: string): string | undefined => {
  if (!str) return undefined;
  // Stop at whitespace, ';', ',' or a quote so trailing parameters and
  // quoting never leak into the label.
  return /charset\s*=\s*["']?([^\s;"',]+)/i.exec(str)?.[1];
};
import * as path from 'path';
import { fetchHtml } from './fetch-html';
/**
 * Summary metadata extracted from a web page.
 *
 * Only `url` is always present; every other field is optional.
 */
export interface WebMetaInfo {
  /** The URL the metadata was requested for. */
  url: string;
  /** Text of the page's `<title>`. */
  title?: string;
  /** Short description of the page. */
  description?: string;
  /** Content of the page's keywords `<meta>` tag. */
  keyword?: string;
  /** Location of the page's icon. */
  favicon?: string;
  /** Preview image advertised by the page (Open Graph). */
  image?: string;
}
/**
 * Turns the `href` of a page's icon `<link>` into an absolute URL.
 *
 * Absolute http(s) URLs and `data:` URIs are returned untouched. Everything
 * else (root-relative, dot-relative, bare relative and protocol-relative
 * references) is resolved against the page URL with URL semantics rather
 * than filesystem path semantics, so `https://` is never collapsed.
 */
const resolveFaviconUrl = (href: string | undefined, pageUrl: URL): string => {
  if (!href) {
    // No icon declared: use the conventional location on the page's origin.
    return pageUrl.origin + '/favicon.ico';
  }
  if (/^((https?:\/\/)|(data:))/.test(href)) {
    return href;
  }
  try {
    return new URL(href, pageUrl).href;
  } catch {
    // Unresolvable reference: hand it back as written rather than failing.
    return href;
  }
};

/**
 * Fetches a web page and extracts its summary metadata.
 *
 * - `title`: text of the first `<title>` element, trimmed.
 * - `description`: `content` of the first description / `og:description`
 *   `<meta>` tag, else the first 100 characters of the page text.
 * - `image`: `content` of the Open Graph image tag (see https://ogp.me/).
 * - `keyword`: `content` of the keywords `<meta>` tag.
 * - `favicon`: absolute URL of the first icon `<link>`, else
 *   `<origin>/favicon.ico`.
 *
 * @param webUrl Absolute URL of the page to inspect.
 * @returns The extracted metadata, or `undefined` when the page could not be
 *          fetched or an error occurred while extracting (the error is logged).
 */
export const fetchWebMetaInfo = async (
  webUrl: string,
): Promise<WebMetaInfo | undefined> => {
  try {
    const $ = await fetchHtml(webUrl);
    if (!$) return;

    const pageUrl = new URL(webUrl);

    // Take only the first <title>: inline SVGs may carry their own <title>
    // elements, and `.text()` on the whole match set would concatenate them.
    const title = $('title').first().text().trim();

    let description = $(
      'meta[name*="description"], meta[property*="og:description"]',
    ).attr('content');

    // Open Graph protocol: https://ogp.me/
    // Impact of Open Graph tags on SEO: https://cloud.tencent.com/developer/article/1617791
    // Match the property names exactly so structured properties such as
    // og:image:width or og:image:type are never mistaken for the image URL.
    const image = $(
      'meta[property="og:image"], meta[property="og:image:url"], meta[property="og:image:secure_url"]',
    ).attr('content');

    const keyword = $('meta[name*="keywords"]').attr('content');

    const favicon = resolveFaviconUrl(
      $('link[rel*="icon"]').attr('href'),
      pageUrl,
    );

    if (!description) {
      // NOTE(review): if the page was parsed in cheerio's fragment mode there
      // may be no <body> element to select, hence the fallback to the text of
      // the whole parsed tree — confirm against the parser settings in use.
      const pageText = $('body').text() || $.root().text();
      // Collapse whitespace so the excerpt is not spent on indentation.
      description = pageText.replace(/\s+/g, ' ').trim().slice(0, 100);
    }

    return {
      url: webUrl,
      title,
      description,
      keyword,
      favicon,
      image,
    };
  } catch (error) {
    console.log(error);
    return;
  }
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment