// animevibe-cli/scrape.js
const fetch = require('node-fetch');
const cheerio = require('cheerio');
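
// Wraps a cheerio-based scrape function so it fetches its own page: the
// returned async function downloads `url` (falling back to `defaultUrl`),
// loads the HTML into cheerio, and merges `{ url }` into the scrape result.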
function makeUrlScrape(scrape, defaultUrl) {
  return async (url = defaultUrl) => {
    const res = await fetch(url);
    const $ = cheerio.load(await res.text());
    return { url, ...scrape($, { url }) };
  };
}
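
// Scrapes an AnimeVibe download page: the first word of each `.alert > p`
// label (lowercased) becomes a key and the adjacent <a>'s href its value.
// Returning `[[key, href]]` keeps each pair intact through .map(), which,
// like jQuery's, flattens returned arrays when collecting results.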
const scrapeAnimevibeDownloadPage = makeUrlScrape($ => {
  const entries = $('.alert > p').map((_, p) => {
    const key = $(p).text().split(' ')[0].toLowerCase();
    const href = $(p).next('a').attr('href');
    return [[key, href]];
  }).get();
  return Object.fromEntries(entries);
});
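
// Scrapes a VidCDN download page into file info (filename, filesize,
// duration, resolution) plus the list of available downloads.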
const scrapeVidCDNDownloadPage = makeUrlScrape($ => {
  const downloads = scrapeDownloads($);
  const info = scrapeInfo($);
  return { ...info, downloads };
});
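
// Collects the download links from the first `div.mirror_link` block.
// Note: `.dowload` appears to be the literal class name used in the page
// markup, not a typo in this code.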
function scrapeDownloads($) {
  return $('div.mirror_link').first()
    .find('.dowload > a').map((_, a) => {
      const info = parseDownload($(a).text());
      const url = $(a).attr('href');
      return { ...info, url };
    }).get();
}
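
// Reads the file summary list (`.sumer_l`) by position, zipping the spans
// with a fixed list of keys in the order they appear on the page.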
function scrapeInfo($) {
  const keys = ['filename', 'filesize', 'duration', 'resolution'];
  const entries = $('.sumer_l > ul > li > span').map((i, span) => {
    const key = keys[i];
    const value = $(span).text();
    return [[key, value]];
  }).get();
  return Object.fromEntries(entries);
}
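
// Parses a link label such as "Download (720P - MP4)" into
// `{ quality: '720', format: 'MP4' }`.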
function parseDownload(text) {
  const regex = /Download\s+\((?<quality>[\w\d]+)P\s+-\s+(?<format>[\w\d]+)\)/;
  const { groups } = regex.exec(text);
  return groups;
}
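
// Scrapes an AnimeVibe series/episode page. The series id is read from the
// URL path; everything else comes from the `#blogShort` info block.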
const scrapeAnimevibeSeriesPage = makeUrlScrape(($, { url }) => {
  const seriesId = url.split('/')[4];
  const downloadId = scrapeDownloadId($);
  const currentEpisodeNumber = scrapeEpisodeNumber($);
  const infoDiv = $('#blogShort');
  const englishTitle = infoDiv.find('h5.title-av-search-res').text();
  const thumbnailUrl = infoDiv.find('#thumb-rsz').attr('data-bg');
  const info = scrapeInfoDiv(infoDiv, $);
  const episodeCount = parseInt(info['Number of Episodes'].split(' ', 1)[0]);
  const views = parseInt(info['Views'].split(' ', 1)[0]);
  const summary = info['Summary'];
  const genres = info['Genre'].split(', ');
  const otherTitles = parseTitles(info['Alternate Titles']);
  const myAnimeListScore = parseFloat(info['[MyAnimeList] Score']);
  const type = info['Type'];
  const status = info['Status'];
  const dates = parseDates(info['Date']);
  return {
    seriesId,
    downloadId,
    currentEpisodeNumber,
    episodeCount,
    titles: { english: englishTitle, ...otherTitles },
    thumbnailUrl,
    type,
    ...dates,
    status,
    genres,
    summary,
    myAnimeListScore,
    views,
  };
});
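
// Returns the number shown on `.current-episode-button`, or null when the
// button is absent.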
function scrapeEpisodeNumber($) {
  const button = $('.current-episode-button');
  return button.length ? parseInt(button.text()) : null;
}
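
// Extracts the `id` query parameter from the first link in `.download-av`.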
function scrapeDownloadId($) {
  const href = $('.download-av > a:nth-child(1)').attr('href');
  return new URL(href).searchParams.get('id');
}
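
// Parses the "Alternate Titles" field, expected as
// "<nativeJapanese>, <japanese>, ['ABBR1', 'ABBR2']"; the trailing list is
// made JSON-parsable by swapping single quotes for double quotes.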
function parseTitles(text) {
  const regex = /(?<nativeJapanese>[^,]*), (?<japanese>[^,]*), (?<jsonParsableAbbreviations>\[.*\])/;
  const { groups } = regex.exec(text);
  const json = groups.jsonParsableAbbreviations.replace(/'/g, '"');
  return {
    nativeJapanese: groups.nativeJapanese,
    japanese: groups.japanese,
    abbreviations: JSON.parse(json),
  };
}
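
// Parses the "Date" field, e.g. "Apr 5, 2020 to Jun 21, 2020" or
// "Apr 5, 2020 to ?"; `finishedDate` is null when no end date is given.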
function parseDates(text) {
  const regex = /(?<releaseDate>\w{3} \d{1,2}, \d{4})( to (\?|(?<finishedDate>\w{3} \d{1,2}, \d{4})))?/;
  const { groups } = regex.exec(text);
  const releaseDate = groups.releaseDate;
  const finishedDate = groups.finishedDate || null;
  return { releaseDate, finishedDate };
}
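
// Turns each "Key: value" heading in the info div into an object entry,
// splitting only on the first ": ".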
function scrapeInfoDiv(infoDiv, $) {
  const entries = infoDiv.find('h6.excerpt-anime-info').map((_, h6) => {
    const [key, value] = $(h6).text().split(/(^[^:]+): /).slice(1);
    return [[key, value]];
  }).get();
  return Object.fromEntries(entries);
}
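
// Resolves an mp4upload page to the direct video file URL by submitting the
// download form and reading the Location header of the unfollowed redirect.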
async function scrapeMp4UploadVideoFileUrl(url) {
  const id = url.split('/').pop();
  const res = await fetch(url, {
    method: 'POST',
    body: new URLSearchParams({ id, op: 'download2' }),
    redirect: 'manual',
  });
  return res.headers.get('location');
}
module.exports = {
  scrapeVidCDNDownloadPage,
  scrapeAnimevibeSeriesPage,
  scrapeAnimevibeDownloadPage,
  scrapeMp4UploadVideoFileUrl,
};
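
// Example usage (a minimal sketch; the URL below is hypothetical, since the
// scraper only assumes the series id sits in the fourth path segment):
//
//   const { scrapeAnimevibeSeriesPage } = require('./scrape');
//   scrapeAnimevibeSeriesPage('https://animevibe.tv/a/some-series/1')
//     .then(series => console.log(series.titles.english, series.episodeCount));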