Last active
February 3, 2023 16:16
-
-
Save guillim/cf0de926708746f3d08177cdfeca4b0a to your computer and use it in GitHub Desktop.
Boulanger #dgm #boulanger
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| //------------HOME------------------------------------------------------------------------------------------------ | |
/**
 * Start-page handler: turns the comma-separated `context.customData` into queued pages.
 *
 * - typeDeCrawl 'profond' or 'simple': every entry is a search keyword (see enqueueSearch).
 * - typeDeCrawl 'produit': every entry is a product URL (see enqueueProductUrl).
 * - anything else: the run is finished with a debug message.
 *
 * Blank entries (e.g. from "a,,b" or a trailing comma) are skipped, and product URLs
 * are trimmed before being queued.
 *
 * @param {Object} context      Crawler context; uses `customData`, `skipOutput` and `finish`.
 * @param {string} typeDeCrawl  Crawl mode: 'profond', 'simple' or 'produit'.
 */
function case_home(context,typeDeCrawl){
    var isSearchCrawl = (typeDeCrawl === 'profond' || typeDeCrawl === 'simple');
    var isProductCrawl = (typeDeCrawl === 'produit');

    if(!isSearchCrawl && !isProductCrawl){
        context.finish({ debugInfo: 'case_home: bug in typeDeCrawl' });
        return;
    }
    // Without a string there is nothing to split; report it instead of throwing.
    if(typeof context.customData !== 'string'){
        context.finish({ debugInfo: 'case_home: customData is missing or not a string' });
        return;
    }

    context.skipOutput();
    context.customData.split(',').forEach(function(entry) {
        var trimmed = entry.trim();
        if(trimmed === ''){
            return; // nothing to search for / no URL to visit
        }
        if(isSearchCrawl){
            // The keyword is passed untouched: enqueueSearch trims it for the URL itself.
            enqueueSearch(context,entry);
        }else{
            enqueueProductUrl(context,trimmed);
        }
    });
}
| //------------SEARCH---------------------------------------------------------------------------------------------- | |
| // function case_search(context,$){ | |
| // var startedAt = Date.now(); | |
| // //context.willFinishLater(); //not needed since the entire process is willfinishlater | |
| // interceptRequestData = context.request.interceptRequestData; | |
| // var f = function() { | |
| // var url,pagesCount; | |
| // var productCountRaw = $(".infoListe span").text(); | |
| // if(productCountRaw){ | |
| // var productCount = (productCountRaw.match(/[0-9, /\s]*résultats/g)) ? productCountRaw.match(/[0-9, /\s]*résultats/g)[0].replace(/[^0-9]/g,'') : 'NA' ; | |
| // productCount = parseInt(productCount); | |
| // pagesCount = (productCount > 15) ? Math.ceil(productCount/20) : 1; | |
| // interceptRequestData.c03_NumberofResults = productCount; | |
| // } | |
| // if( Date.now() - startedAt > 10000 ) { // timeout after 10 seconds | |
| // interceptRequestData.debugInfo = 'case_search: timeout after 10 seconds - is captcha true or false: ' + checkCaptcha($) | |
| // context.finish(interceptRequestData); | |
| // } | |
| // if(productCountRaw && pagesCount){ | |
| // for(i=1; i<2; i++){ | |
| // url = "https://www.boulanger.com/resultats?tr="+ encodeURI( interceptRequestData.c01_keyword.trim().replace(/(\s{1,})/g, '+') )+"&numPage="+i; | |
| // enqueueLabel(context,'searchpagination',url,interceptRequestData); | |
| // } | |
| // context.skipOutput(); | |
| // context.finish(); | |
| // }else{ | |
| // setTimeout(f, 2000); | |
| // } | |
| // }; | |
| // setTimeout(f, 500); | |
| // } | |
| //------------SEARCH PAGINATION----------------------------------------------------------------------------------- | |
/**
 * Handler for a search-result page (label 'searchpagination').
 *
 * Waits 500 ms, then polls every 2 seconds until at least one `div.product` node exists.
 * One record is then built per product row, starting from a clone of the request's
 * `interceptRequestData`. With typeDeCrawl 'profond' each usable row is queued as a
 * 'product' page and output is skipped; with any other value the usable rows are
 * returned through `context.finish`. A row is usable when it has a link and its
 * position is at most 15.
 *
 * If more than 10 seconds have elapsed since the handler started, the run is finished
 * once with a `debugInfo` message and polling stops.
 *
 * @param {Object} context       Crawler context; uses `request.interceptRequestData`, `finish`, `skipOutput`.
 * @param {Function} $           jQuery-like selector function for the page.
 * @param {Object} _             Utility object; only `_.clone` is used.
 * @param {string} site          Not used by this handler.
 * @param {string} typeDeCrawl   'profond' to queue product pages, anything else to output rows.
 */
function case_searchpagination(context,$,_,site,typeDeCrawl){
    var TIMEOUT_MS = 10000;
    var FIRST_POLL_DELAY_MS = 500;
    var RETRY_DELAY_MS = 2000;
    var MAX_POSITION = 15;
    var BASE_URL = 'https://www.boulanger.com';
    var startedAt = Date.now();
    // Kept local (not a global) and defaulted so the timeout path can always attach debugInfo.
    var requestData = context.request.interceptRequestData || {};

    // Converts a displayed price ("1.299,00€", "399€99") to a number; 'NA' when the text is empty.
    var toPriceNumber = function(rawPrice) {
        var normalized = rawPrice.replace(/\./g,"").replace(/,/g,".").replace(/€/g, ".");
        return (normalized) ? parseFloat(normalized.replace(/[^0-9.]*/g,'')) : 'NA';
    };

    // Reads the result counter shown on the page: the empty text when there is none,
    // 'NA' when the text holds no "<digits> article" part, otherwise the integer.
    var readNumberOfResults = function() {
        var counterText = $(".infoListe span").text();
        if(!counterText){
            return counterText;
        }
        var counterMatch = counterText.match(/[0-9, /\s]*article/g);
        var digits = (counterMatch) ? counterMatch[0].replace(/[^0-9]/g,'') : '';
        return (digits) ? parseInt(digits, 10) : 'NA';
    };

    var poll = function() {
        if( Date.now() - startedAt > TIMEOUT_MS ) { // timeout after 10 seconds
            requestData.debugInfo = 'case_searchpagination: timeout after 10 seconds - is captcha true or false: ' + checkCaptcha($);
            context.finish(requestData);
            return; // stop here: no further polling and no second finish
        }

        var productRows = $("div.product");
        if(!(productRows.length > 0)){
            setTimeout(poll, RETRY_DELAY_MS);
            return;
        }

        var result = [];
        var position = 0;
        var numberOfResults = readNumberOfResults(); // identical for every row, so read once

        productRows.each(function() {
            var row = $(this);
            var item = _.clone(requestData);
            item.c02_marketplaceName = 'boulanger.com';
            item.c03_NumberofResults = numberOfResults;
            item.c24_pageNumber = 1;

            var titleLink = row.find('h2 a');
            var href = titleLink.attr('href');
            // Left empty when the row has no link, so the usability check below rejects it.
            item.c06_itemURL = (href) ? BASE_URL + href : '';
            position++;
            item.c07_position = position;
            item.c04_asin = (href) ? href.replace(/[^0-9.]*/g,'') : 'NA';
            item.c05_itemTitle = removeSpecCharacterIfExist(titleLink.text().trim());
            item.c09_seller = 'Boulanger';

            item.c14_priceRaw = removeSpecCharacterIfExist(row.find('.priceBarre:eq(0)').text());
            var priceLowRaw = removeSpecCharacterIfExist(row.find('.fix-price .exponent:eq(0)').text())
                + removeSpecCharacterIfExist(row.find('.fix-price sup:eq(0)').text());
            item.c13_priceLow = toPriceNumber(priceLowRaw);
            item.c12_priceHigh = toPriceNumber(item.c14_priceRaw);
            // Whatever remains of the raw price once digits and separators are removed.
            item.c15_currency = currencyFormater(item.c14_priceRaw.replace(/[0-9., /\s]*/g,''));

            var commentsText = row.find(".rating > span:contains('avis'):eq(0)").text().trim();
            item.c08_numberofcomments = (commentsText) ? commentsText.replace(",","").replace(/[^0-9]/g,'') : 0;

            // The rating is read from a "star_NN" class name and divided by 10.
            // When the row has no rating class at all the field is deliberately left
            // untouched, so a later product-page step can still fill it in.
            var ratingClass = row.find(".rating").attr('class');
            if(ratingClass){
                var starMatch = ratingClass.match(/star_[0-9]*/g);
                var starDigits = (starMatch && starMatch[0]) ? starMatch[0].replace(/[^0-9]/g,'') : false;
                item.c10_star = (starDigits) ? parseInt(starDigits, 10) / 10 : 'NA';
            }

            var isUsable = item.c06_itemURL && item.c07_position && item.c07_position <= MAX_POSITION;
            if(typeDeCrawl === 'profond'){
                // Deep crawl: visit the product page of every usable row.
                if(isUsable){
                    enqueueLabelUniqueKey(context,'product',item.c06_itemURL,item);
                    context.skipOutput();
                }else{
                    console.log('typeDeCrawl === profond -------- c06_itemURL est pas defini -------- ou c07_position est superieur a 30 voir pas defini');
                }
            }else{
                if(isUsable){
                    result.push(_.clone(item));
                }else{
                    console.log('typeDeCrawl !== profond-------- c06_itemURL est pas defini -------- ou c07_position est superieur a 30 voir pas defini');
                }
            }
        });
        context.finish(result);
    };
    setTimeout(poll, FIRST_POLL_DELAY_MS);
}
| //------------PRODUCT----------------------------------------------------------------------------------- | |
/**
 * Handler for a product page (label 'product').
 *
 * Polls every 2 seconds until `h1#title` exists, then completes the record carried in
 * `context.request.interceptRequestData` (or a fresh object when there is none) and
 * finishes the request with it. Fields c04, c05, c08, c09, c10, c12, c13, c14 and c15
 * are only filled when the carried record holds no truthy value for them; the other
 * fields written here are always overwritten.
 *
 * If more than 10 seconds have elapsed since the handler started, the run is finished
 * once with a `debugInfo` message and polling stops.
 *
 * NOTE(review): the selectors (#ASIN, #SalesRank, #merchant-info, ...) and the
 * 'amazon.fr' branches look like they target Amazon markup — confirm they match the
 * pages this crawler actually visits.
 *
 * @param {Object} context  Crawler context; uses `request.interceptRequestData` and `finish`.
 * @param {Function} $      jQuery-like selector function for the page.
 * @param {Object} _        Utility object, forwarded to nullIfnothingOrNARepartitionStar.
 * @param {string} site     Selects the 'amazon.fr' variants of the rating/review scraping.
 */
function case_product(context,$,_,site){
    var TIMEOUT_MS = 10000;
    var RETRY_DELAY_MS = 2000;
    var data = (context.request.interceptRequestData) ? context.request.interceptRequestData : {};
    var startedAt = Date.now();

    // Trims scraped text, then collapses whitespace runs.
    var clean = function(text) {
        return removeSpecCharacterIfExist(trimIfExist(text));
    };

    // Stores `value` under `key` unless the record already holds a truthy value there.
    var fillIfMissing = function(key, value) {
        if(!data[key]){
            data[key] = value;
        }
    };

    // Reads the share of each star level (5 down to 1) into an object keyed star5..star1.
    var readStarRepartition = function() {
        var repartition = {};
        [5, 4, 3, 2, 1].forEach(function(level) {
            var key = 'star' + level;
            if(site === 'amazon.fr'){
                var percentText = $("." + level + "star:contains('%')").text();
                repartition[key] = (percentText) ? percentText.match(/[0-9]*%/g)[0] : null;
            }else{
                repartition[key] = trimIfExist($("a." + level + "star").text()).replace(level + " star","");
            }
        });
        return repartition;
    };

    // Collects the reviews shown on the page; the markup differs per site.
    var readReviews = function() {
        var reviews = [];
        if(site === 'amazon.fr'){
            $(".a-section.review").each(function() {
                var node = $(this);
                var review = {};
                review.c06_title = node.find('.a-row .review-title').text();
                review.c03_note = node.find('i.a-icon-star > span.a-icon-alt').text().trim().replace(/out of 5 stars|étoiles sur 5/g,'');
                review.c05_texte = node.find('div.a-section .review-text').text().trim();
                review.c04_type = clean(node.find(".badges-genome-widget").text());
                var votes = clean(node.find(".cr-vote-buttons").text());
                var votesMatch = (votes) ? votes.match(/.*cela utile/g) : null;
                review.c02_helpfulReview = (votesMatch) ? votesMatch[0] : 'NA';
                review.c07_verified = node.find("span:contains('Achat vérifié')").length > 0;
                reviews.push( review );
            });
        }else{
            $("#cm-cr-review-list div.review").each(function() {
                var node = $(this);
                var review = {};
                review.c06_title = node.find('a.review-title').text();
                review.c03_note = node.find('i.a-icon-star > span.a-icon-alt').text().trim().replace(/out of 5 stars|étoiles sur 5/g,'');
                review.c05_texte = node.find('.review-data .a-expander-content').text();
                review.c04_type = clean(node.find(".review-format-strip").text());
                review.c02_helpfulReview = clean(node.find(".review-votes").text());
                review.c07_verified = node.find(".review-format-strip:contains('Verified Purchase'),.review-format-strip:contains('Achat vérifié')").length > 0;
                reviews.push( review );
            });
        }
        return reviews;
    };

    var poll = function() {
        if( Date.now() - startedAt > TIMEOUT_MS ) { // timeout after 10 seconds
            data.debugInfo = 'case_product: timeout after 10 seconds - check imageCount ? is captcha true or false:' + checkCaptcha($);
            context.finish(data);
            return; // stop here: no further polling and no second finish
        }
        if(!$("h1#title").length){
            setTimeout(poll, RETRY_DELAY_MS);
            return;
        }

        // Image count: first layout that yields thumbnails wins, 0 otherwise.
        data.c18_nombreDImages = $("#altImages li.a-declarative").length || $("#imageBlockThumbs img").length || 0;
        data.c16_descriptionProduit1 = clean($("#feature-bullets").text());

        var asin = ( $("#ASIN").attr("value") ) ? $("#ASIN").attr("value") : $("#averageCustomerReviews").attr("data-asin");
        fillIfMissing('c04_asin', clean(asin));
        fillIfMissing('c05_itemTitle', clean($("h1#title").text()));

        var reviewHeading = $("h2[data-hook*='review']").text();
        var numberofcomments = (reviewHeading) ? reviewHeading.replace(/[^0-9]/g,'') : '';
        fillIfMissing('c08_numberofcomments', clean(numberofcomments));

        data.c17_descriptionProduit2 = clean($("#productDescription").text());

        var seller = ( $("#bylineInfo").text() ) ? $("#bylineInfo").text() : '';
        fillIfMissing('c09_seller', clean(seller));

        var star = 'NA';
        try {
            star = $('.AverageCustomerReviews i.a-icon > span.a-icon-alt').text().trim().replace(/(out of)|(sur)|(.?toile.?)|(star.?)|( 5)/g,'');
        }
        catch(err) { console.error(err); }
        fillIfMissing('c10_star', clean(star));

        // Price: "low-high" ranges are split on '-'; a missing side becomes 'NA'.
        var priceRaw = removeSpecCharacterIfExist($('span.a-color-price:eq(0)').text());
        var priceNumber = priceRaw.replace(/EUR/g, "").replace(/\./g,"").replace(/,/g,".");
        var priceParts = (priceNumber) ? priceNumber.split("-") : [];
        var priceLow = (priceParts[0]) ? parseFloat(priceParts[0].replace(/[^0-9.]*/g,'')) : 'NA';
        var priceHigh = (priceParts[1]) ? parseFloat(priceParts[1].replace(/[^0-9.]*/g,'')) : 'NA';
        var currency = currencyFormater(priceRaw.replace(/[0-9., /\s]*/g,''));
        fillIfMissing('c12_priceHigh', priceHigh);
        fillIfMissing('c13_priceLow', priceLow);
        fillIfMissing('c14_priceRaw', priceRaw);
        fillIfMissing('c15_currency', currency);

        data.c11_repartition = nullIfnothingOrNARepartitionStar(readStarRepartition(),_);

        // Sales rank: prefer the dedicated value node, else the node's own text plus its list.
        var rankText = clean($("#SalesRank .value").text());
        if(!rankText){
            rankText = clean( $("#SalesRank").clone().children().remove().end().text().trim() +' '+ $('#SalesRank ul').text().trim() );
            rankText = (rankText) ? rankText.replace('()','') : rankText;
        }
        data.c21_Bestseller = [];

        var merchantInfo = $("#merchant-info").text();
        if(merchantInfo){
            var merchantClean = clean(merchantInfo);
            data.c25_soldAndSentBy = (merchantClean) ? merchantClean.replace(/\n/g,'') : 'NA';
        }

        // Each "n°"-separated part is either "<rank> en <main category>" or "<rank> dans <breadcrumb>".
        (rankText || '').split(/(?:n°)/).forEach(function(part) {
            var entry = clean(part);
            if(!entry){
                return;
            }
            if (entry.search(/^[0-9.,]*\sdans\s/g) != -1) {
                var bestseller = {};
                bestseller.Breadcrums = entry.replace(/^[0-9.,]*\sdans\s/g,'');
                bestseller.rank = entry.match(/^[0-9.,]*/g)[0];
                data.c21_Bestseller.push( bestseller );
            } else if (entry.search(/^[0-9.,]*\sen\s/g) != -1){
                data.c19_MainCategorie = entry.replace(/^[0-9.,]*\sen\s/g,'').replace('(Voir les 100 premiers)','');
                data.c20_Maincategorieposition = entry.match(/^[0-9.,]*/g)[0];
            }
        });

        data.c22_reviews = readReviews();
        context.finish(data);
    };
    poll();
}
| //------------------------------------------------------------------------------------------------ | |
| //-----------------------------------function----------------------------------------------------- | |
| //------------------------------------------------------------------------------------------------ | |
/**
 * Queues the Boulanger search-result page for one keyword (label 'searchpagination').
 *
 * The keyword is trimmed and split on whitespace; every word is percent-encoded on its
 * own and the words are joined with '+'. Encoding word by word keeps characters such as
 * '&', '#', '=' or '+' inside the `tr` parameter instead of letting them break the query.
 *
 * @param {Object} context  Crawler context; only `enqueuePage` is used.
 * @param {string} keyword  Search keyword; stored unchanged as `c01_keyword`.
 */
function enqueueSearch(context,keyword) {
    var searchUrl = 'https://www.boulanger.com/resultats?tr=';
    var encodedKeyword = keyword.trim().split(/\s+/).map(function(word) {
        return encodeURIComponent(word);
    }).join('+');
    context.enqueuePage({
        label: 'searchpagination',
        url: searchUrl + encodedKeyword,
        interceptRequestData: { c01_keyword: keyword }
    });
}
/**
 * Queues a single product page (label 'product').
 *
 * The data carried with the request records the URL as `c06_itemURL` and the
 * marketplace name 'boulanger.com' as `c02_marketplaceName`.
 *
 * @param {Object} context  Crawler context; only `enqueuePage` is used.
 * @param {string} url      Product page URL.
 */
function enqueueProductUrl(context,url) {
    var carriedData = {};
    carriedData.c06_itemURL = url;
    carriedData.c02_marketplaceName = 'boulanger.com';

    var page = {};
    page.label = 'product';
    page.url = url;
    page.interceptRequestData = carriedData;

    context.enqueuePage(page);
}
/**
 * Queues a page under an arbitrary label, passing the given data along with the request.
 *
 * @param {Object} context               Crawler context; only `enqueuePage` is used.
 * @param {string} label                 Label of the queued page.
 * @param {string} url                   URL to queue.
 * @param {Object} interceptRequestData  Data handed over as-is (same object, not a copy).
 */
function enqueueLabel(context,label,url,interceptRequestData) {
    var page = {};
    page.label = label;
    page.url = url;
    page.interceptRequestData = interceptRequestData;
    context.enqueuePage(page);
}
/**
 * Queues a page under a label with an explicit `uniqueKey`.
 *
 * The key is the URL, followed by the item URL found in the carried data
 * (`c06_itemURL`, falling back to `itemURL`, else empty), followed by a random integer,
 * so two calls with the same URL produce different keys.
 *
 * @param {Object} context               Crawler context; only `enqueuePage` is used.
 * @param {string} label                 Label of the queued page.
 * @param {string} url                   URL to queue.
 * @param {Object} interceptRequestData  Data handed over as-is; may be missing.
 */
function enqueueLabelUniqueKey(context,label,url,interceptRequestData) {
    var carried = interceptRequestData || {};
    // The records built by this crawler store the link under c06_itemURL.
    var itemUrl = carried.c06_itemURL || carried.itemURL || '';
    var randomSuffix = Math.floor(Math.random() * 1000000000000000);
    context.enqueuePage({
        label: label,
        url: url,
        uniqueKey: url + itemUrl + randomSuffix,
        interceptRequestData: interceptRequestData
        // ,queuePosition: "LAST"
    });
}
/**
 * Returns `false` when `text` is both truthy and strictly equal to '', otherwise `text`.
 *
 * NOTE: an empty string is falsy, so that condition can never hold and the argument
 * is always returned unchanged. Callers in this file chain string methods on the result.
 *
 * @param {*} text  Any value.
 * @returns {*} The value that was passed in.
 */
function replaceByfalseifneeded(text) {
    if(text && text === ''){
        return false;
    }
    return text;
}
/**
 * Trims a truthy string and passes the result through replaceByfalseifneeded.
 * Falsy input (undefined, null, '', ...) is returned as it came.
 *
 * @param {string|*} text  Text to trim.
 * @returns {string|*} Result of replaceByfalseifneeded on the trimmed text, or the falsy input.
 */
function trimIfExist(text) {
    if(!text){
        return text;
    }
    var trimmed = text.trim();
    return replaceByfalseifneeded(trimmed);
}
/**
 * Replaces every run of two or more whitespace characters, and every literal
 * backslash followed by "n", with a single space. Falsy input is returned unchanged.
 *
 * @param {string|*} text  Text to normalise.
 * @returns {string|*} Normalised text, or the falsy input.
 */
function removeSpecCharacterIfExist(text) {
    if(!text){
        return text;
    }
    var whitespaceRunOrEscapedNewline = /(\s\s+|\\n)/gi;
    return text.replace(whitespaceRunOrEscapedNewline, ' ');
}
/**
 * Normalises a star-distribution object without modifying the one passed in.
 *
 * A property counts as empty when it is strictly `false`, `null` or ''. When every own
 * property is empty, all of them become 'NA' (nothing available); otherwise only the
 * empty ones become "0%". Values that are not objects are returned through `_.clone`
 * untouched.
 *
 * @param {Object} obj  Distribution to normalise, e.g. { star5: '60%', star4: '', ... }.
 * @param {Object} _    Utility object; only `_.clone` is used.
 * @returns {Object} Normalised copy.
 */
function nullIfnothingOrNARepartitionStar(obj,_){
    var normalized = _.clone(obj);
    if(obj === null || typeof obj !== 'object'){
        return normalized;
    }
    var keys = Object.keys(obj);
    var emptyKeys = keys.filter(function(key) {
        return obj[key] === false || obj[key] === null || obj[key] === '';
    });
    // "Nothing at all" is decided against the real number of properties.
    var nothingAvailable = keys.length > 0 && emptyKeys.length === keys.length;
    emptyKeys.forEach(function(key) {
        normalized[key] = (nothingAvailable) ? 'NA' : '0%';
    });
    return normalized;
}
/**
 * Collapses a duplicated currency such as "EUR-EUR" to "EUR".
 *
 * The text is split on '-'; when there are at least two parts and the first two are
 * identical, the first part is returned. In every other case the text is returned as is.
 *
 * @param {string} text  Currency text.
 * @returns {string} First part when the first two parts match, otherwise `text`.
 */
function currencyFormater(text) {
    var parts = text.split(/-/g);
    if(parts.length <= 1){
        return text;
    }
    if(parts[0] !== parts[1]){
        return text;
    }
    return parts[0];
}
| //captcha alert | |
/**
 * Tells whether the page shows one of the known captcha / robot-check sentences.
 *
 * Each phrase is looked up with a `div:contains("...")` selector, in the order listed;
 * the search stops at the first selector whose result length is not 0.
 *
 * @param {Function} $  jQuery-like selector function for the page.
 * @returns {boolean} true when any phrase is found, false otherwise.
 */
function checkCaptcha($){
    var captchaPhrases = [
        "make sure you're not a robot.",
        "ne suis pas un robot",
        "n'êtes pas un robot",
        "caractères que vous voyez",
        "the characters you see",
        "les caractères affichés"
    ];
    for(var index = 0; index < captchaPhrases.length; index++){
        var selector = 'div:contains("' + captchaPhrases[index] + '")';
        if($(selector).length !== 0){
            return true;
        }
    }
    return false;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment