Skip to content

Instantly share code, notes, and snippets.

@alphonse92
Last active August 21, 2020 02:22
Show Gist options
  • Select an option

  • Save alphonse92/468e11c83484ffadfdf2ac23fee90e8d to your computer and use it in GitHub Desktop.

Select an option

Save alphonse92/468e11c83484ffadfdf2ac23fee90e8d to your computer and use it in GitHub Desktop.
Script to scrape the results of a Google Images search.
// Keywords: Google Chrome, Chromium, Puppeteer, Selenium, web scraping
// You can use this script to collect the URLs of all the images of a
// Google Images search, as far as Google allows.
// 1. Go to Google Images.
// 2. Do a search, e.g. "casa".
// 3. Copy the code below, paste it into the browser console and wait until it finishes.
// To see the URLs of all the images, inspect window.__GoogleImagesFullRessImages__ once the script has finished.
/**
 * Periodically scrolls the window to the bottom of the document until the
 * "end of results" element becomes visible or the maximum number of scroll
 * passes is reached, clicking the "load more" element whenever it exists.
 *
 * @param {Object} [opts]
 * @param {Function} [opts.onFinish] Called exactly once, with no arguments,
 *   when scrolling stops (end reached, limit reached, or after an error).
 * @param {Function} [opts.onError] Called with the thrown error when a scroll
 *   pass fails; scrolling is stopped afterwards.
 * @param {string} [opts.scrollDownClass] Class name(s) of the "load more"
 *   element to click. NOTE(review): the default looks like a generated Google
 *   Images class name and may be outdated — confirm against the live page.
 * @param {string} [opts.scrollEndClass] Class name(s) of the element whose
 *   visibility marks the end of the results (same caveat as above).
 * @param {number} [opts.interval=100] Delay between scroll passes, in ms.
 * @param {number} [opts.maxIntervals=0] Maximum number of scroll passes;
 *   0 (or a negative value) means "no limit".
 */
function scrollToTheEndOrToMax(opts = {}) {
  const {
    onFinish,
    onError,
    scrollDownClass = "mye4qd",
    scrollEndClass = "OuJzKb Yu2Dnd",
    interval = 100,
    maxIntervals = 0,
  } = opts;
  let intervalId;
  let isFinished = false;
  let currentInterval = 0;

  // Stops the timer and notifies the caller. Guarded by a flag so that
  // onFinish can never fire twice (e.g. when onFinish itself throws and the
  // catch block below asks to stop again).
  function clearScrollInterval() {
    if (isFinished) return;
    isFinished = true;
    clearInterval(intervalId);
    onFinish && onFinish();
  }

  // Performs a single scroll pass.
  function scrollDown() {
    if (isFinished) return;
    try {
      // Stop once exactly `maxIntervals` passes have been performed.
      if (maxIntervals > 0 && currentInterval >= maxIntervals) {
        clearScrollInterval();
        return;
      }
      // Jump to the current bottom of the document.
      window.scrollTo(0, document.body.scrollHeight);
      const btnToScrollDown = document.getElementsByClassName(scrollDownClass)[0];
      const labelEndOfThePage = document.getElementsByClassName(scrollEndClass)[0];
      // A displayed "end" element means there is nothing more to load.
      if (labelEndOfThePage && window.getComputedStyle(labelEndOfThePage, null).display !== "none") {
        clearScrollInterval();
        return;
      }
      // Otherwise ask the page for more results when the button exists.
      if (btnToScrollDown) {
        btnToScrollDown.click();
      }
      currentInterval++;
    } catch (e) {
      // Report the actual error, and always stop the timer even when the
      // error handler itself throws.
      try {
        onError && onError(e);
      } finally {
        clearScrollInterval();
      }
    }
  }

  intervalId = setInterval(scrollDown, interval);
}
/**
 * Builds a function that, when invoked, walks through the result thumbnails
 * one per timer tick, clicks each thumbnail's first anchor, and collects the
 * `src` of the first "selected image" element that is neither a data URI nor
 * an "encrypted-tbn" thumbnail URL.
 *
 * @param {Object} [opts]
 * @param {Function} [opts.onNext] Called with `{ url, thumbImgEl, imgEl }`
 *   for every URL collected.
 * @param {Function} [opts.onFinish] Called with the array of collected URLs
 *   once every thumbnail has been visited.
 * @param {Function} [opts.onError] Called with the thrown error when a tick
 *   fails; processing continues with the next thumbnail.
 * @param {number} [opts.maxImageToProcess] Maximum number of thumbnails to
 *   visit; when missing or not greater than 0, all thumbnails are visited.
 * @param {number} [opts.interval=1000] Delay between thumbnails, in ms.
 * @param {string} [opts.singleImageClassName] Class name(s) of a thumbnail
 *   container. NOTE(review): the default looks like a generated Google Images
 *   class name and may be outdated — confirm against the live page.
 * @param {string} [opts.selectedImageClassName] Class name(s) of the elements
 *   searched for the full-resolution image (same caveat as above).
 * @returns {Function} A zero-argument function that starts the extraction.
 */
function getFunctionToExtractFullResImages(opts = {}) {
  const {
    onNext,
    onFinish,
    onError,
    maxImageToProcess,
    interval = 1000,
    singleImageClassName = "PNCib MSM1fd BUooTd",
    selectedImageClassName = "n3VNCb",
  } = opts;

  // True for elements whose `src` is a real URL. Elements without a string
  // `src` are skipped instead of throwing a TypeError.
  const isFullResImage = ({ src }) =>
    typeof src === "string" && !src.startsWith("data:") && !src.includes("encrypted-tbn");

  return function () {
    const arrayOfUrls = [];
    const arrayOfImageElements = Array.from(document.getElementsByClassName(singleImageClassName));
    // `slice` already excludes its end index, so the limit is used as-is.
    const limit = maxImageToProcess > 0 ? maxImageToProcess : arrayOfImageElements.length;
    const arrayOfAnchors = arrayOfImageElements
      .slice(0, limit)
      .map((div) => div.getElementsByTagName("a"));
    let index = 0;

    const fetchUrlIntervalId = setInterval(function () {
      try {
        // All thumbnails visited: stop the timer and hand over the results.
        if (index >= arrayOfAnchors.length) {
          clearInterval(fetchUrlIntervalId);
          onFinish && onFinish(arrayOfUrls);
          return;
        }
        // Advance first so a failure on this thumbnail cannot stall the loop.
        const thumbImgEl = arrayOfAnchors[index++][0];
        thumbImgEl.click();
        // NOTE(review): the selected elements are read right after the click;
        // presumably the page may not have swapped in the full-resolution
        // `src` yet, in which case this thumbnail is skipped — confirm.
        const candidates = Array.from(document.getElementsByClassName(selectedImageClassName));
        const imgEl = candidates.find(isFullResImage);
        if (!imgEl) return;
        const url = imgEl.src;
        arrayOfUrls.push(url);
        onNext && onNext({ url, thumbImgEl, imgEl });
      } catch (e) {
        onError && onError(e);
      }
    }, interval);
  };
}
// Example of usage: scroll through the results first, then walk the
// thumbnails and publish the collected URLs on the window object.
scrollToTheEndOrToMax({
  onError: console.log,
  // The extractor runs as soon as scrolling has finished.
  onFinish: getFunctionToExtractFullResImages({
    maxImageToProcess: 5,
    onError: console.log,
    onNext: console.log,
    onFinish: (results) => (window.__GoogleImagesFullRessImages__ = results),
  }),
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment