Skip to content

Instantly share code, notes, and snippets.

@guillim
Last active February 3, 2023 16:16
Show Gist options
  • Select an option

  • Save guillim/cf0de926708746f3d08177cdfeca4b0a to your computer and use it in GitHub Desktop.

Select an option

Save guillim/cf0de926708746f3d08177cdfeca4b0a to your computer and use it in GitHub Desktop.
Boulanger #dgm #boulanger
//------------HOME------------------------------------------------------------------------------------------------
// Entry point for the start page: turns the comma-separated context.customData
// into follow-up requests.
//
//   typeDeCrawl 'profond' or 'simple' -> every entry is a search keyword
//   typeDeCrawl 'produit'             -> every entry is a product URL
//   anything else                     -> finishes with a debugInfo message
//
// The start page itself produces no output (context.skipOutput()) when entries
// are enqueued. Entries that are empty or whitespace-only (e.g. a trailing
// comma) are ignored instead of being enqueued as empty requests.
function case_home(context,typeDeCrawl){
  var isSearchCrawl = (typeDeCrawl === 'profond' || typeDeCrawl === 'simple');
  var isProductCrawl = (typeDeCrawl === 'produit');

  if(!isSearchCrawl && !isProductCrawl){
    context.finish({ debugInfo: 'case_home: bug in typeDeCrawl' });
    return;
  }
  // Without this guard a missing customData throws on .split() and the run
  // ends without any hint about the cause.
  if(typeof context.customData !== 'string'){
    context.finish({ debugInfo: 'case_home: customData is missing or not a string' });
    return;
  }

  context.skipOutput();
  context.customData.split(',').forEach(function(entry) {
    if(entry.trim() === ''){
      return;
    }
    if(isSearchCrawl){
      enqueueSearch(context,entry);
    }else{
      enqueueProductUrl(context,entry);
    }
  });
}
//------------SEARCH----------------------------------------------------------------------------------------------
// function case_search(context,$){
// var startedAt = Date.now();
// //context.willFinishLater(); //not needed since the entire process is willfinishlater
// interceptRequestData = context.request.interceptRequestData;
// var f = function() {
// var url,pagesCount;
// var productCountRaw = $(".infoListe span").text();
// if(productCountRaw){
// var productCount = (productCountRaw.match(/[0-9, /\s]*résultats/g)) ? productCountRaw.match(/[0-9, /\s]*résultats/g)[0].replace(/[^0-9]/g,'') : 'NA' ;
// productCount = parseInt(productCount);
// pagesCount = (productCount > 15) ? Math.ceil(productCount/20) : 1;
// interceptRequestData.c03_NumberofResults = productCount;
// }
// if( Date.now() - startedAt > 10000 ) { // timeout after 10 seconds
// interceptRequestData.debugInfo = 'case_search: timeout after 10 seconds - is captcha true or false: ' + checkCaptcha($)
// context.finish(interceptRequestData);
// }
// if(productCountRaw && pagesCount){
// for(i=1; i<2; i++){
// url = "https://www.boulanger.com/resultats?tr="+ encodeURI( interceptRequestData.c01_keyword.trim().replace(/(\s{1,})/g, '+') )+"&numPage="+i;
// enqueueLabel(context,'searchpagination',url,interceptRequestData);
// }
// context.skipOutput();
// context.finish();
// }else{
// setTimeout(f, 2000);
// }
// };
// setTimeout(f, 500);
// }
//------------SEARCH PAGINATION-----------------------------------------------------------------------------------
// Scrapes one boulanger.com search-result page.
//
// Polls the page (first after 500 ms, then every 2 s) until at least one
// "div.product" tile exists, then builds one record per tile on top of a copy
// of context.request.interceptRequestData. After 10 s without tiles it
// finishes once with a debugInfo message and stops polling.
//
// Parameters:
//   context     - crawler context; request.interceptRequestData, finish() and
//                 skipOutput() are used.
//   $           - jQuery-like selector function bound to the loaded page.
//   _           - utility library; only clone() is used.
//   site        - not used by this function.
//   typeDeCrawl - 'profond': each kept product is enqueued with the label
//                 'product' and this page's output is skipped; any other
//                 value: the kept records are returned via context.finish().
//
// Only tiles at positions 1..15 that expose a product link are kept.
function case_searchpagination(context,$,_,site,typeDeCrawl){
  var MAX_WAIT_MS = 10000;
  var RETRY_DELAY_MS = 2000;
  var MAX_POSITION = 15;
  var BASE_URL = 'https://www.boulanger.com';
  var startedAt = Date.now();
  // Local on purpose: an undeclared assignment here would create a global
  // shared by every page being processed.
  var requestData = context.request.interceptRequestData || {};

  // Converts a French-formatted price text ("1.299,99 €", "499€99") to a
  // number; returns 'NA' for an empty text.
  function toPriceNumber(rawPrice) {
    var normalized = rawPrice.replace(/\./g,"").replace(/,/g,".").replace(/€/g, ".");
    return (normalized) ? parseFloat(normalized.replace(/[^0-9.]*/g,'')) : 'NA';
  }

  // Reads the result counter of the page. Returns the (empty) text untouched
  // when there is no counter, NaN when the text has no "<digits> article" part.
  function readNumberOfResults() {
    var counterText = $(".infoListe span").text();
    if(!counterText){
      return counterText;
    }
    var matches = counterText.match(/[0-9, /\s]*article/g);
    var digits = (matches) ? matches[0].replace(/[^0-9]/g,'') : 'NA';
    return parseInt(digits, 10);
  }

  var h = function() {
    if( Date.now() - startedAt > MAX_WAIT_MS ) { // timeout after 10 seconds
      requestData.debugInfo = 'case_searchpagination: timeout after 10 seconds - is captcha true or false: ' + checkCaptcha($);
      context.finish(requestData);
      return; // without this the poll would be rescheduled and finish() called again
    }

    var tiles = $("div.product");
    if(tiles.length === 0){
      setTimeout(h, RETRY_DELAY_MS);
      return;
    }

    var result = [];
    var position = 0;
    var numberOfResults = readNumberOfResults(); // same for every tile of the page

    tiles.each(function() {
      var tile = $(this);
      var item = _.clone(requestData);
      var link = tile.find('h2 a');
      var href = link.attr('href');

      item.c02_marketplaceName = 'boulanger.com';
      item.c03_NumberofResults = numberOfResults;
      item.c24_pageNumber = 1;
      // Left undefined when the tile has no link so the guard below can skip it
      // (concatenating would produce the always-truthy "...comundefined").
      item.c06_itemURL = (href) ? BASE_URL + href : undefined;
      position++;
      item.c07_position = position;
      item.c04_asin = (href) ? href.replace(/[^0-9.]*/g,'') : 'NA';
      item.c05_itemTitle = removeSpecCharacterIfExist(link.text().trim());
      item.c09_seller = 'Boulanger';

      // c14 is the crossed-out price text; c13 is the current price, split on
      // the page between an integer part and a <sup> part.
      item.c14_priceRaw = removeSpecCharacterIfExist(tile.find('.priceBarre:eq(0)').text());
      var currentPriceText = removeSpecCharacterIfExist(tile.find('.fix-price .exponent:eq(0)').text()) + removeSpecCharacterIfExist(tile.find('.fix-price sup:eq(0)').text());
      item.c13_priceLow = toPriceNumber(currentPriceText);
      item.c12_priceHigh = toPriceNumber(item.c14_priceRaw);
      item.c15_currency = currencyFormater(item.c14_priceRaw.replace(/[0-9., /\s]*/g,''));

      var commentsText = tile.find(".rating > span:contains('avis'):eq(0)").text().trim();
      item.c08_numberofcomments = (commentsText) ? commentsText.replace(/[^0-9]/g,'') : 0;

      // The rating is encoded in a class such as "star_45" (= 4.5).
      var ratingClass = tile.find(".rating").attr('class');
      item.c10_star = ratingClass;
      if(ratingClass){
        var starMatch = ratingClass.match(/star_[0-9]*/g);
        var starDigits = (starMatch && starMatch[0]) ? starMatch[0].replace(/[^0-9]/g,'') : false;
        item.c10_star = (starDigits) ? parseInt(starDigits, 10) / 10 : 'NA';
      }

      var isKept = Boolean(item.c06_itemURL && item.c07_position && item.c07_position <= MAX_POSITION);
      if(typeDeCrawl === 'profond'){
        // Deep crawl: visit the product page of every kept tile.
        if(isKept){
          enqueueLabelUniqueKey(context,'product',item.c06_itemURL,item);
          context.skipOutput();
        }else{
          console.log('typeDeCrawl === profond -------- c06_itemURL est pas defini -------- ou c07_position est superieur a ' + MAX_POSITION + ' voir pas defini');
        }
      }else{
        if(isKept){
          result.push(item);
        }else{
          console.log('typeDeCrawl !== profond-------- c06_itemURL est pas defini -------- ou c07_position est superieur a ' + MAX_POSITION + ' voir pas defini');
        }
      }
    });

    context.finish(result);
  };
  setTimeout(h, 500);
}
//------------PRODUCT-----------------------------------------------------------------------------------
// Scrapes a product page and completes the record carried in
// context.request.interceptRequestData (or a fresh object when there is none).
//
// Polls immediately, then every 2 s, until "h1#title" exists; after 10 s it
// finishes once with a debugInfo message and stops polling. Fields already
// present on the record (asin, title, comments, seller, star, prices,
// currency) are kept; they are only filled from this page when empty.
//
// Parameters:
//   context - crawler context; request.interceptRequestData and finish() are used.
//   $       - jQuery-like selector function bound to the loaded page.
//   _       - utility library, forwarded to nullIfnothingOrNARepartitionStar().
//   site    - 'amazon.fr' selects the amazon.fr star-distribution and review
//             selectors; any other value uses the alternative selectors.
//
// NOTE(review): the selectors (#ASIN, #SalesRank, #bylineInfo, ...) and the
// 'amazon.fr' branch look inherited from an Amazon scraper - confirm they
// match boulanger.com product pages.
function case_product(context,$,_,site){
  var MAX_WAIT_MS = 10000;
  var RETRY_DELAY_MS = 2000;
  var interceptRequestData = (context.request.interceptRequestData) ? context.request.interceptRequestData : {};
  var startedAt = Date.now();
  //context.willFinishLater();

  // Keeps a value that is already set, otherwise uses the one scraped here.
  function keepOrFill(current, scraped) {
    return (!current) ? scraped : current;
  }

  // Trims a text and collapses whitespace runs via the shared helpers.
  function cleanText(text) {
    return removeSpecCharacterIfExist(trimIfExist(text));
  }

  // Returns the first "NN%" found in the text of `selector`, or null.
  function percentShare(selector) {
    var text = $(selector).text();
    var found = (text) ? text.match(/[0-9]*%/g) : null;
    return (found) ? found[0] : null;
  }

  // Returns the trimmed text of `selector` with the "N star" label removed.
  function labelledShare(selector, label) {
    return trimIfExist($(selector).text()).replace(label,"");
  }

  var g = function() {
    if( Date.now() - startedAt > MAX_WAIT_MS ) { // timeout after 10 seconds
      interceptRequestData.debugInfo = 'case_product: timeout after 10 seconds - check imageCount ? is captcha true or false:' + checkCaptcha($);
      context.finish(interceptRequestData);
      return; // without this the poll would be rescheduled and finish() called again
    }
    if(!$("h1#title").length){
      setTimeout(g, RETRY_DELAY_MS);
      return;
    }

    // ---- images and descriptions
    var imageCountClassic = $("#altImages li.a-declarative").length;
    var imageCountAlternatif = $("#imageBlockThumbs img").length;
    interceptRequestData.c18_nombreDImages = imageCountClassic || imageCountAlternatif || 0;
    interceptRequestData.c16_descriptionProduit1 = cleanText($("#feature-bullets").text());

    // ---- fields that may already come from the search page
    var asin = $("#ASIN").attr("value") || $("#averageCustomerReviews").attr("data-asin");
    interceptRequestData.c04_asin = keepOrFill(interceptRequestData.c04_asin, cleanText(asin));
    interceptRequestData.c05_itemTitle = keepOrFill(interceptRequestData.c05_itemTitle, cleanText($("h1#title").text()));

    var reviewHeading = $("h2[data-hook*='review']").text();
    var numberofcomments = (reviewHeading) ? reviewHeading.replace(/[^0-9]/g,'') : '';
    interceptRequestData.c08_numberofcomments = keepOrFill(interceptRequestData.c08_numberofcomments, cleanText(numberofcomments));

    interceptRequestData.c17_descriptionProduit2 = cleanText($("#productDescription").text());

    var seller = $("#bylineInfo").text() || '';
    interceptRequestData.c09_seller = keepOrFill(interceptRequestData.c09_seller, cleanText(seller));

    var star = 'NA';
    try {
      star = $('.AverageCustomerReviews i.a-icon > span.a-icon-alt').text().trim().replace(/(out of)|(sur)|(.?toile.?)|(star.?)|( 5)/g,'');
    }
    catch(e) { console.error(e); }
    interceptRequestData.c10_star = keepOrFill(interceptRequestData.c10_star, cleanText(star));

    // ---- price: "low - high" range in French number format
    var c14_priceRaw = removeSpecCharacterIfExist($('span.a-color-price:eq(0)').text());
    var priceNumber = c14_priceRaw.replace(/EUR/g, "").replace(/\./g,"").replace(/,/g,".");
    var priceBounds = (priceNumber) ? priceNumber.split("-") : [];
    var priceNumberLow = priceBounds[0];
    var priceNumberHigh = priceBounds[1];
    var c13_priceLow = (priceNumberLow) ? parseFloat(priceNumberLow.replace(/[^0-9.]*/g,'')) : 'NA';
    var c12_priceHigh = (priceNumberHigh) ? parseFloat(priceNumberHigh.replace(/[^0-9.]*/g,'')) : 'NA';
    var c15_currency = currencyFormater(c14_priceRaw.replace(/[0-9., /\s]*/g,''));
    interceptRequestData.c12_priceHigh = keepOrFill(interceptRequestData.c12_priceHigh, c12_priceHigh);
    interceptRequestData.c13_priceLow = keepOrFill(interceptRequestData.c13_priceLow, c13_priceLow);
    interceptRequestData.c14_priceRaw = keepOrFill(interceptRequestData.c14_priceRaw, c14_priceRaw);
    interceptRequestData.c15_currency = keepOrFill(interceptRequestData.c15_currency, c15_currency);

    // ---- star distribution
    var repartition = {};
    if(site === 'amazon.fr'){
      repartition.star5 = percentShare(".5star:contains('%')");
      repartition.star4 = percentShare(".4star:contains('%')");
      repartition.star3 = percentShare(".3star:contains('%')");
      repartition.star2 = percentShare(".2star:contains('%')");
      repartition.star1 = percentShare(".1star:contains('%')");
    }else{
      repartition.star5 = labelledShare("a.5star", "5 star");
      repartition.star4 = labelledShare("a.4star", "4 star");
      repartition.star3 = labelledShare("a.3star", "3 star");
      repartition.star2 = labelledShare("a.2star", "2 star");
      repartition.star1 = labelledShare("a.1star", "1 star");
    }
    interceptRequestData.c11_repartition = nullIfnothingOrNARepartitionStar(repartition,_);

    // ---- sales rank: main category ("NN en ...") and sub-rankings ("NN dans ...")
    var rawText = cleanText($("#SalesRank .value").text());
    if(!rawText){
      rawText = cleanText( $("#SalesRank").clone().children().remove().end().text().trim() +' '+ $('#SalesRank ul').text().trim() );
      rawText = (rawText) ? rawText.replace('()','') : rawText;
    }
    // split() always yields at least one chunk, so every case (including
    // "no n° marker at all") is handled by the loop below.
    var rawTextMatch_all = (rawText || '').split(/(?:n°)/);
    interceptRequestData.c21_Bestseller = [];

    var merchantInfo = $("#merchant-info").text();
    if(merchantInfo){
      var merchantText = cleanText(merchantInfo);
      interceptRequestData.c25_soldAndSentBy = (merchantText) ? merchantText.replace(/\n/g,'') : 'NA';
    }

    rawTextMatch_all.forEach(function(chunk) {
      var entry = cleanText(chunk);
      if (entry.search(/^[0-9.,]*\sdans\s/g) !== -1) {
        var BestsellerObj = {};
        BestsellerObj.Breadcrums = entry.replace(/^[0-9.,]*\sdans\s/g,'');
        BestsellerObj.rank = entry.match(/^[0-9.,]*/g)[0];
        interceptRequestData.c21_Bestseller.push( BestsellerObj );
      } else if (entry.search(/^[0-9.,]*\sen\s/g) !== -1){
        interceptRequestData.c19_MainCategorie = entry.replace(/^[0-9.,]*\sen\s/g,'').replace('(Voir les 100 premiers)','');
        interceptRequestData.c20_Maincategorieposition = entry.match(/^[0-9.,]*/g)[0];
      }
    });

    // ---- reviews
    interceptRequestData.c22_reviews = [];
    if(site === 'amazon.fr'){
      $(".a-section.review").each(function() {
        var block = $(this);
        var review = {};
        review.c06_title = block.find('.a-row .review-title').text();
        review.c03_note = block.find('i.a-icon-star > span.a-icon-alt').text().trim().replace(/out of 5 stars|étoiles sur 5/g,'');
        review.c05_texte = block.find('div.a-section .review-text').text().trim();
        review.c04_type = cleanText(block.find(".badges-genome-widget").text());
        var votes = cleanText(block.find(".cr-vote-buttons").text());
        var usefulMatch = (votes) ? votes.match(/.*cela utile/g) : null;
        review.c02_helpfulReview = (usefulMatch) ? usefulMatch[0] : 'NA';
        review.c07_verified = block.find("span:contains('Achat vérifié')").length > 0;
        interceptRequestData.c22_reviews.push( review );
      });
    }else{
      $("#cm-cr-review-list div.review").each(function() {
        var block = $(this);
        var review = {};
        review.c06_title = block.find('a.review-title').text();
        review.c03_note = block.find('i.a-icon-star > span.a-icon-alt').text().trim().replace(/out of 5 stars|étoiles sur 5/g,'');
        review.c05_texte = block.find('.review-data .a-expander-content').text();
        review.c04_type = cleanText(block.find(".review-format-strip").text());
        review.c02_helpfulReview = cleanText(block.find(".review-votes").text());
        review.c07_verified = block.find(".review-format-strip:contains('Verified Purchase'),.review-format-strip:contains('Achat vérifié')").length > 0;
        interceptRequestData.c22_reviews.push( review );
      });
    }

    context.finish(interceptRequestData);
  };
  g();
}
//------------------------------------------------------------------------------------------------
//-----------------------------------function-----------------------------------------------------
//------------------------------------------------------------------------------------------------
// Enqueues the boulanger.com search-result page (label 'searchpagination')
// for one keyword.
//
// The keyword is trimmed, split on whitespace and each word is percent-encoded
// on its own before the words are joined with '+'. Encoding word by word with
// encodeURIComponent keeps characters such as '&', '#', '=' or '+' inside the
// "tr" parameter instead of letting them break the query string.
// The keyword is stored unchanged in interceptRequestData.c01_keyword.
function enqueueSearch(context,keyword) {
  var searchUrl = 'https://www.boulanger.com/resultats?tr=';
  var encodedKeyword = keyword.trim().split(/\s+/).map(function(word) {
    return encodeURIComponent(word);
  }).join('+');
  context.enqueuePage({
    label: 'searchpagination',
    url: searchUrl + encodedKeyword,
    interceptRequestData: { c01_keyword: keyword }
  });
}
// Queues one product page (label 'product'). The record attached to the
// request starts with the page URL and the marketplace name.
function enqueueProductUrl(context,url) {
  var requestData = {};
  requestData.c06_itemURL = url;
  requestData.c02_marketplaceName = 'boulanger.com';

  var page = {};
  page.label = 'product';
  page.url = url;
  page.interceptRequestData = requestData;

  context.enqueuePage(page);
}
// Queues `url` under the given label, attaching `interceptRequestData` to the
// request exactly as received.
function enqueueLabel(context,label,url,interceptRequestData) {
  var page = {};
  page.label = label;
  page.url = url;
  page.interceptRequestData = interceptRequestData;
  context.enqueuePage(page);
}
// Queues `url` under the given label with an explicit uniqueKey made of the
// URL, the record's item URL and a random number, so the same URL can be
// queued more than once.
//
// The item URL is read from c06_itemURL, the field the records in this file
// actually carry; it contributes nothing when the record or the field is
// missing (previously the literal text "undefined" ended up in every key).
function enqueueLabelUniqueKey(context,label,url,interceptRequestData) {
  var itemURL = (interceptRequestData && interceptRequestData.c06_itemURL) ? interceptRequestData.c06_itemURL : '';
  var randomSuffix = Math.floor(Math.random() * 1000000000000000);
  context.enqueuePage({
    label: label,
    url: url,
    uniqueKey: url + itemURL + randomSuffix,
    interceptRequestData: interceptRequestData
    // ,queuePosition: "LAST"
  });
}
// Returns `text`, or false when `text` is both truthy and equal to ''.
// NOTE(review): an empty string is never truthy, so the false branch cannot
// be reached and the input always comes back unchanged. Callers in this file
// call string methods on the result, so check them before changing this.
function replaceByfalseifneeded(text) {
  if(text && text === ''){
    return false;
  }
  return text;
}
// Trims `text` and passes the result through replaceByfalseifneeded().
// Falsy input ('', null, undefined, ...) is returned untouched.
function trimIfExist(text) {
  if(!text){
    return text;
  }
  var trimmed = text.trim();
  return replaceByfalseifneeded(trimmed);
}
// Replaces every run of two or more whitespace characters, and every literal
// backslash-n sequence, with a single space. Falsy input is returned untouched.
function removeSpecCharacterIfExist(text) {
  if(!text){
    return text;
  }
  var whitespaceRunOrEscapedNewline = /(\s\s+|\\n)/gi;
  return text.replace(whitespaceRunOrEscapedNewline, ' ');
}
// Normalises a star-distribution object without touching the input.
// Works on a copy made with _.clone(): every value that is false, null or ''
// becomes "0%". When exactly five values were missing, every property of the
// copy is set to 'NA' instead (nothing was available at all).
function nullIfnothingOrNARepartitionStar(obj,_){
  var normalized = _.clone(obj);
  var missingCount = 0;
  var key;

  for(key in obj) {
    var value = obj[key];
    var isMissing = (value === false || value === null || value === '');
    if(isMissing){
      normalized[key] = "0%";
      missingCount += 1;
    }
  }

  if(missingCount !== 5){
    return normalized;
  }

  // five missing values: report the whole distribution as not available
  for(key in obj) {
    normalized[key] = 'NA';
  }
  return normalized;
}
// Collapses a repeated currency such as "EUR-EUR" (left over from a price
// range) to a single value: when the text split on '-' has at least two parts
// and the first two are identical, the first part is returned; otherwise the
// text is returned unchanged.
function currencyFormater(text) {
  var parts = text.split(/-/g);
  var startsWithRepeatedPart = parts.length > 1 && parts[0] === parts[1];
  if(startsWithRepeatedPart){
    return parts[0];
  }
  return text;
}
//captcha alert
// Returns true when any <div> of the page contains one of the known captcha
// phrases (English or French), false otherwise. Phrases are checked in order
// and the search stops at the first hit.
function checkCaptcha($){
  var captchaPhrases = [
    "make sure you're not a robot.",
    "ne suis pas un robot",
    "n'êtes pas un robot",
    "caractères que vous voyez",
    "the characters you see",
    "les caractères affichés"
  ];
  for(var i = 0; i < captchaPhrases.length; i++){
    var selector = 'div:contains("' + captchaPhrases[i] + '")';
    if($(selector).length !== 0){
      return true;
    }
  }
  return false;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment