How to scrape product characteristics on Walmart?
Mikhail Zub
Posted on October 25, 2021
Intro
In this post, I want to explain how to scrape product characteristics on Walmart with Node.js. I will show you several ways to do this.
Preparation
First, we need to create a Node.js project and add the npm packages "Puppeteer" and "Puppeteer stealth plugin". To do this, open the command line in the project directory and enter:
npm init -y
then:
npm i puppeteer puppeteer-extra puppeteer-extra-plugin-stealth
What will be scraped
Walmart organic results
then the characteristics of each product
Process
SelectorGadget Chrome extension was used to grab CSS selectors.
The Gif below illustrates the approach of selecting different parts of the organic results.
Code
const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
// Walmart home page: the scraper starts here and the captcha helpers build
// the expected search URLs from it. Note that this name shadows the global
// `URL` constructor inside this module.
const URL = "https://www.walmart.com/";
// Search term; the URI-encoded form is what gets passed to getAllSpecs.
const searchString = "laptop";
const encodedString = encodeURI(searchString);
// Browser handle shared by the scraping functions: getOrganicResults assigns
// it, getProductsSpecs opens tabs on it and getAllSpecs closes it. Declared
// here so the assignment does not create an implicit global (which throws a
// ReferenceError in strict mode / ES modules).
let browser;
// Register the stealth plugin before any browser is launched.
puppeteer.use(StealthPlugin());
/**
 * Runs the press-and-hold captcha routine when the page is not on the
 * expected address.
 *
 * @param {object} page - Puppeteer page to inspect and drive.
 * @param {string} url - Address the page is expected to be on.
 * @returns {Promise<void>}
 */
async function solveCaptcha(page, url) {
  const landedOn = await page.evaluate(() => location.href);
  // Loose comparison on purpose: it mirrors how callers' `url` values have
  // always been matched against the address string.
  if (landedOn == url) {
    return;
  }

  await page.waitForSelector("#px-captcha");
  await page.waitForTimeout(3000);
  // Two Tab presses, presumably to move focus onto the captcha button.
  for (const key of ["Tab", "Tab"]) {
    await page.keyboard.press(key);
  }
  await page.keyboard.press("Enter", { delay: 10000 });
  await page.waitForTimeout(5000);
}
/**
 * Waits for the "#px-captcha" element, then sends Tab, Tab and a long Enter
 * press to the page. Private helper of solveSecondCaptcha.
 *
 * @param {object} page - Puppeteer page showing the captcha.
 * @returns {Promise<void>}
 */
async function holdCaptchaButton(page) {
  await page.waitForSelector("#px-captcha");
  await page.waitForTimeout(3000);
  // Two Tab presses, presumably to move focus onto the captcha button.
  await page.keyboard.press("Tab");
  await page.keyboard.press("Tab");
  // `delay` keeps the key down for 10 s before releasing it.
  await page.keyboard.press("Enter", { delay: 10000 });
  await page.keyboard.up("Enter");
  await page.waitForTimeout(5000);
}

/**
 * Runs the captcha routine when the page is on a search-style address
 * (contains "?query=" or "?q=") that differs from the expected results URL.
 *
 * @param {object} page - Puppeteer page to inspect and drive.
 * @param {string} url - Site root, ending with "/", used to build the expected URLs.
 * @param {string} searchQuery - Query string expected in the results URL.
 * @returns {Promise<void>}
 */
async function solveSecondCaptcha(page, url, searchQuery) {
  const currentPage = await page.evaluate(() => location.href);

  // The two URL shapes the search results page is expected to use.
  const expectedByMarker = [
    { marker: "?query=", expected: `${url}search/?query=${searchQuery}` },
    { marker: "?q=", expected: `${url}search?q=${searchQuery}` },
  ];

  for (const { marker, expected } of expectedByMarker) {
    if (currentPage.includes(marker) && currentPage !== expected) {
      await holdCaptchaButton(page);
    }
  }
}
/**
 * Launches a browser, searches the site for `searchQuery` and returns the
 * product cards found on the results page.
 *
 * The launched browser is stored in the shared `browser` variable and is left
 * open on success, because getProductsSpecs and getAllSpecs keep using it.
 * If anything fails after the launch, the browser is closed before the error
 * is rethrown so no browser process is left behind.
 *
 * @param {string} url - Start page to open.
 * @param {string} searchQuery - Text typed into the search box.
 * @returns {Promise<Array<{link: string, title: (string|null), price: (string|null)}>>}
 *   One entry per product card that has a link; `title`/`price` are `null`
 *   when the card lacks the corresponding element.
 */
async function getOrganicResults(url, searchQuery) {
  browser = await puppeteer.launch({
    headless: false,
    args: ["--no-sandbox", "--disable-setuid-sandbox"],
  });

  try {
    const page = await browser.newPage();
    await page.setDefaultNavigationTimeout(60000);
    await page.goto(url);
    await solveCaptcha(page, url);

    await page.waitForSelector("input[name='q']");
    await page.type("input[name='q']", searchQuery, { delay: 10 });
    await page.keyboard.press("Enter");
    await page.waitForTimeout(10000);
    await solveSecondCaptcha(page, url, searchQuery);

    await page.waitForSelector("div[data-item-id]");
    await page.waitForTimeout(5000);

    // Runs inside the page. Every lookup is null-safe so that one card with
    // unexpected markup cannot abort the whole extraction.
    return await page.evaluate(() => {
      const cards = document.querySelectorAll("div[data-stack-index] div[data-item-id]");
      return Array.from(cards)
        .map((card) => {
          const href = card.querySelector("a[link-identifier]")?.getAttribute("href");
          if (href == null) {
            // Without a link the product cannot be visited later; skip it.
            return null;
          }
          return {
            link: `https://www.walmart.com${href}`,
            title: card.querySelectorAll(".lh-title")[1]?.innerText ?? null,
            price: card.querySelector(".f4-l")?.innerText ?? null,
          };
        })
        .filter((product) => product !== null);
    });
  } catch (err) {
    // Best-effort cleanup: a failure while closing must not hide the real error.
    await browser.close().catch(() => {});
    throw err;
  }
}
/**
 * Opens a product page in a new tab of the shared `browser`, reads its
 * specification names and values, logs the result and returns it.
 *
 * The tab is always closed, including when navigation or extraction throws.
 *
 * @param {{link: string, title: *, price: *}} product - Entry produced by getOrganicResults.
 * @returns {Promise<({link: string, title: *, price: *, specifications: Object<string, string>}|undefined)>}
 *   The product with its specifications, or `undefined` for links containing
 *   "https://wrd.walmart.com/" (presumably redirect/tracking links), which are skipped.
 */
async function getProductsSpecs(product) {
  if (product.link.toLowerCase().includes("https://wrd.walmart.com/")) {
    return;
  }

  const url = product.link;
  const page = await browser.newPage();
  let keys = [];
  let values = [];

  try {
    await page.setDefaultNavigationTimeout(60000);
    await page.goto(url);
    await solveCaptcha(page, url);
    await page.waitForTimeout(10000);

    // Runs the given selector inside the page and returns each match's text.
    const readTexts = (selector) =>
      page.evaluate(
        (sel) => Array.from(document.querySelectorAll(sel)).map((el) => el.innerText),
        selector,
      );

    // Two page layouts are handled: a specification table, and a fallback
    // made of headings with spans.
    const tableCells = await page.$$(".product-specification-table td");
    const [keySelector, valueSelector] =
      tableCells.length > 1
        ? [".product-specification-table td:first-child", ".product-specification-table td:last-child"]
        : [".nt1 .pb2 h3", ".nt1 .pb2 p span"];

    keys = await readTexts(keySelector);
    values = await readTexts(valueSelector);
  } finally {
    await page.close();
  }

  const productInfo = {
    link: url,
    title: product.title,
    price: product.price,
    specifications: {},
  };
  keys.forEach((key, i) => {
    // A missing or empty value is reported as "no spec".
    productInfo.specifications[`${key}`] = values[i] ? values[i].trim() : "no spec";
  });

  console.log(productInfo);
  return productInfo;
}
/**
 * Scrapes the search results for `searchQuery`, then visits each product one
 * after another to collect its specifications, and finally closes the shared
 * browser.
 *
 * @param {string} url - Start page passed to getOrganicResults.
 * @param {string} searchQuery - Search text passed to getOrganicResults.
 * @returns {Promise<Array<object>>} The collected product entries; products
 *   for which getProductsSpecs returns nothing are left out.
 */
async function getAllSpecs(url, searchQuery) {
  const mainProductInfo = await getOrganicResults(url, searchQuery);
  const products = [];

  try {
    // Sequential on purpose: each product is opened in its own tab, one at a time.
    for (const product of mainProductInfo) {
      const productInfo = await getProductsSpecs(product);
      if (productInfo !== undefined) {
        products.push(productInfo);
      }
    }
  } finally {
    // Close the browser even when a product page fails.
    await browser.close();
  }

  return products;
}
// Entry point. The rejection handler keeps a failed run from ending as an
// unhandled promise rejection and marks the process as failed.
getAllSpecs(URL, encodedString).catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
Output
{
link: 'https://www.walmart.com/ip/HP-15-Pentium-4GB-128GB-Laptop-Scarlet-Red/307924252',
title: 'HP 15 Pentium 4GB/128GB Laptop-Scarlet Red',
price: '$299.00',
specifications: {
'Processor Brand': 'Intel',
'Processor Type': 'Intel',
'Hard Drive Capacity': '128 GB',
'Processor Core Type': 'Quad-Core',
Manufacturer: 'HP',
'Product Line': 'HP',
'Processor Speed': '1.1 GHz',
'Manufacturer Part Number': '1A493UA#ABA',
'Wireless Technology': '802.11b',
'RAM Memory': '4 GB',
'Maximum RAM Supported': '4 GB',
'Operating System': 'Windows 10',
'Battery Life': '10.45 h',
Model: '15-dw0083wm',
'Screen Size': '15.6 in',
Brand: 'HP',
Features: 'No features description available',
'Assembled Product Dimensions (L x W x H)': '22.00 x 4.00 x 14.00 Inches'
}
}
Using Walmart Search Engine Results API
SerpApi is a free API with 100 searches per month. If you need more searches, there are paid plans.
The difference is that all you need to do is iterate over ready-made, structured JSON instead of coding everything from scratch and picking the correct selectors, which can be time-consuming.
First we need to install "google-search-results-nodejs". To do this you need to enter:
npm i google-search-results-nodejs
Code
const util = require("util");
const { GoogleSearch } = require("google-search-results-nodejs");
// SerpApi client; the key is read from the API_KEY environment variable.
const search = new GoogleSearch(process.env.API_KEY); //Your API key
// Bound so the method keeps `search` as its `this` when called on its own.
const getJson = search.json.bind(search);

// Promise-returning variant of getJson, registered under util.promisify.custom
// so that util.promisify(getJson) hands back exactly this function.
// NOTE(review): `reject` is passed as a third argument; whether search.json
// ever calls it is not visible here — confirm against the library.
getJson[util.promisify.custom] = function getJsonAsPromise(params) {
  return new Promise(function runRequest(resolve, reject) {
    getJson(params, resolve, reject);
  });
};

const promisifiedGetJson = util.promisify(getJson);
/**
 * Searches Walmart for "laptop" through the API, requests the product details
 * of every organic result that carries an id, logs the combined list and
 * returns it.
 *
 * @returns {Promise<Array<{link: *, title: *, price: *, specifications: (Object<string, *>|string)}>>}
 *   One entry per product that had an id. `specifications` maps each
 *   highlight's display name to its value, or is the string
 *   "no detailed specifications" when the product has no highlights.
 */
async function getMainInfo() {
  const paramsSearch = {
    engine: "walmart",
    query: "laptop",
  };
  const data = await promisifiedGetJson(paramsSearch);
  const organicResults = data.organic_results ?? [];

  // Results without any usable id cannot be looked up, so they are dropped
  // here instead of being carried along as placeholders.
  const productIds = organicResults
    .map((organicResult) => organicResult.us_item_id || organicResult.upc || organicResult.product_id)
    .filter(Boolean);

  // The product lookups are independent of each other, so run them in parallel.
  const productResults = await Promise.all(
    productIds.map((productId) =>
      promisifiedGetJson({ engine: "walmart_product", product_id: productId }),
    ),
  );

  const fullProductsSpecs = productResults.map((el) => {
    const productResult = el.product_result;
    const specificationHighlights = productResult?.specification_highlights;
    return {
      link: el.search_metadata?.walmart_product_url,
      title: productResult?.title,
      price: productResult?.price_map?.price,
      specifications: specificationHighlights
        ? Object.fromEntries(
            specificationHighlights.map(({ display_name, value }) => [display_name, value]),
          )
        : "no detailed specifications",
    };
  });

  console.log(fullProductsSpecs);
  return fullProductsSpecs;
}
// Entry point. The rejection handler keeps a failed run from ending as an
// unhandled promise rejection and marks the process as failed.
getMainInfo().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
Output
{
link: 'https://www.walmart.com/ip/Refurbished-Apple-MacBook-Pro-13-3-Laptop-LED-Intel-i5-3210M-2-5GHz-4GB-500GB-MD101LLA/708321782',
title: 'Refurbished Apple MacBook Pro 13.3 Laptop LED Intel i5 3210M 2.5GHz 4GB 500GB - MD101LLA',
price: 279.97,
specifications: {
'Laptop Computer Type': 'MacBooks',
'Screen Size': '13.3 in',
'RAM Memory': '4 KB',
'Operating System': 'Windows',
'Hard Drive Capacity': '500 KB',
'Battery Life': '10 h'
}
}
Links
Code in the online IDE • SerpApi Playground
Outro
If you want to see how to scrape something using Node.js that I didn't write about yet or you want to see some project made with SerpApi, please write me a message.
Posted on October 25, 2021
Join Our Newsletter. No Spam, Only the good stuff.
Sign up to receive the latest update from our blog.