Web scraping Google Trends with Nodejs
Mikhail Zub
Posted on September 4, 2022
Full code
If you don't need an explanation, have a look at the full code example in the online IDE
const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
puppeteer.use(StealthPlugin());
const searchQueries = ["Mercedes"]; // what we want to search (for interestOverTime, interestByRegion, relatedQueries, relatedTopics)
// const searchQueries = ["Mercedes", "BMW", "Audi"]; // what we want to search (for interestOverTime, comparedByRegion, interestByRegion, relatedQueries)

/**
 * Builds the Google Trends "explore" URL for the given search terms.
 *
 * Every term is escaped on its own with `encodeURIComponent`, so characters
 * that are significant inside a query string (`&`, `#`, `+`, `=`) cannot
 * truncate or corrupt the `q` parameter. The terms are then joined with a
 * literal comma, the separator the explore page uses to compare terms.
 *
 * @param {string[]} queries - Search terms to compare.
 * @returns {string} Absolute URL of the explore page (English UI, `hl=en`).
 */
function buildTrendsURL(queries) {
  const encodedQueries = queries.map((query) => encodeURIComponent(query)).join(",");
  return `https://trends.google.com/trends/explore?q=${encodedQueries}&hl=en`;
}

const URL = buildTrendsURL(searchQueries);
/**
 * Resolves after `ms` milliseconds.
 * Used instead of `page.waitForTimeout()`, which recent Puppeteer releases no longer provide.
 */
const delay = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

/**
 * Parses the body of a Google Trends `widgetdata` response.
 *
 * The JSON document is preceded by a short anti-hijacking prefix (`)]}',` plus a
 * newline), so everything before the first `{` is dropped before parsing.
 *
 * @param {string} body - Raw response text.
 * @returns {object|undefined} The `default` member of the payload, or `undefined`
 *   when the body contains no JSON object or has no `default` member.
 */
function parseTrendsWidgetBody(body) {
  const jsonStart = body.indexOf("{");
  if (jsonStart === -1) return undefined;
  return JSON.parse(body.slice(jsonStart))?.default;
}

/** Pairs every entry of `searchQueries` with its formatted and numeric value from one data row. */
const mapValuesPerQuery = (dataEl) =>
  searchQueries.map((query, i) => ({
    query,
    value: dataEl.formattedValue[i],
    extractedValue: dataEl.value[i],
  }));

/** Returns the `rankedKeyword` list at `index` (0 = top, 1 = rising), or `[]` when it is absent. */
const getRankedKeywords = (parsedData, index) => parsedData.rankedList?.[index]?.rankedKeyword ?? [];

/** Converts a ranked list of related queries into the output format. */
const mapRelatedQueries = (rankedKeywords) =>
  rankedKeywords.map((dataEl) => ({
    query: dataEl.query,
    value: dataEl.formattedValue,
    extractedValue: dataEl.value,
    link: `https://trends.google.com${dataEl.link}`,
  }));

/** Converts a ranked list of related topics into the output format. */
const mapRelatedTopics = (rankedKeywords) =>
  rankedKeywords.map((dataEl) => ({
    topic: {
      title: dataEl.topic.title,
      type: dataEl.topic.type,
    },
    value: dataEl.formattedValue,
    extractedValue: dataEl.value,
    link: `https://trends.google.com${dataEl.link}`,
  }));

/**
 * Opens the Google Trends explore page for `searchQueries` and collects the data
 * of its widgets by listening to the page's network responses.
 *
 * @param {object} [options]
 * @param {number} [options.warmupMs=5000] - Pause between the first navigation and the reload.
 * @param {number} [options.settleMs=10000] - Time given to the widget responses to arrive after the reload.
 * @returns {Promise<{interestOverTime: object, comparedByRegion: object[], interestByRegion: object[], relatedQueries: object[], relatedTopics: object}>}
 *   Everything collected before the browser was closed; parts whose response never arrived stay empty.
 * @throws Rethrows launch and navigation errors; the browser is closed first.
 */
async function getGoogleTrendsResults({ warmupMs = 5000, settleMs = 10000 } = {}) {
  const interestOverTime = {};
  const comparedByRegion = [];
  const interestByRegion = [];
  const relatedQueries = [];
  const relatedTopics = {};
  const valuePattern = /%22value%22:%22(?<value>[^%]+)/gm; //https://regex101.com/r/PNcP1u/1

  // Stores the data of a single response in the collections above.
  const collectWidgetData = async (response) => {
    if (!response.headers()["content-type"]?.includes("application/")) return;

    const responseURL = response.url();
    const isTimeline = responseURL.includes("widgetdata/multiline?");
    const isGeo = responseURL.includes("widgetdata/comparedgeo?");
    const isRelated = responseURL.includes("widgetdata/relatedsearches?");
    // Only widget responses are read: fetching the body of unrelated responses is wasted work and may fail.
    if (!isTimeline && !isGeo && !isRelated) return;

    const parsedData = parseTrendsWidgetBody(await response.text());
    if (!parsedData) return;

    if (isTimeline) {
      interestOverTime.timelineData = parsedData.timelineData?.map((dataEl) => ({
        date: decodeURI(dataEl.formattedTime),
        values: mapValuesPerQuery(dataEl),
      }));
      interestOverTime.averages = (parsedData.averages ?? []).map((average, i) => ({
        query: searchQueries[i],
        value: average,
      }));
      return;
    }

    // The request URL names the keyword(s) the widget was built for.
    const values = [...responseURL.matchAll(valuePattern)].map(({ groups }) => groups.value);

    if (isGeo && values.length > 1) {
      comparedByRegion.push(
        ...(parsedData.geoMapData ?? []).map((dataEl) => ({
          geo: dataEl.geoCode,
          location: dataEl.geoName,
          maxValueIndex: dataEl.maxValueIndex,
          values: mapValuesPerQuery(dataEl),
        }))
      );
      return;
    }

    // Every remaining widget belongs to exactly one of our search queries.
    const query = values[0];
    if (!searchQueries.includes(query)) return;

    if (isGeo) {
      interestByRegion.push({
        query,
        data: (parsedData.geoMapData ?? []).map((dataEl) => ({
          geo: dataEl.geoCode,
          location: dataEl.geoName,
          maxValueIndex: dataEl.maxValueIndex,
          value: dataEl.formattedValue[0],
          extractedValue: dataEl.value[0],
        })),
      });
    } else if (responseURL.includes("%22keywordType%22:%22ENTITY%22")) {
      relatedTopics.top = mapRelatedTopics(getRankedKeywords(parsedData, 0));
      relatedTopics.rising = mapRelatedTopics(getRankedKeywords(parsedData, 1));
    } else {
      relatedQueries.push({
        searchQuery: query,
        top: mapRelatedQueries(getRankedKeywords(parsedData, 0)),
        rising: mapRelatedQueries(getRankedKeywords(parsedData, 1)),
      });
    }
  };

  const browser = await puppeteer.launch({
    headless: false,
    args: ["--no-sandbox", "--disable-setuid-sandbox"],
  });

  try {
    const page = await browser.newPage();
    page.setDefaultNavigationTimeout(60000);
    await page.goto(URL);
    await delay(warmupMs);

    // Attach the listener before reloading so that no widget response of the reloaded page is missed.
    page.on("response", (response) => {
      // A failing response must neither abort the run nor surface as an unhandled rejection.
      collectWidgetData(response).catch((error) => {
        console.warn(`Skipped Google Trends response ${response.url()}:`, error);
      });
    });

    await page.reload();
    await delay(settleMs);
  } finally {
    // Always release the browser process, even when navigation fails.
    await browser.close();
  }

  return { interestOverTime, comparedByRegion, interestByRegion, relatedQueries, relatedTopics };
}
// Run the scraper and print the complete result tree; report failures instead of
// leaving an unhandled promise rejection.
getGoogleTrendsResults()
  .then((result) => console.dir(result, { depth: null }))
  .catch((error) => {
    console.error("Failed to scrape Google Trends:", error);
    process.exitCode = 1;
  });
Preparation
First, we need to create a Node.js* project and add npm
packages puppeteer
, puppeteer-extra
and puppeteer-extra-plugin-stealth
to control Chromium (or Chrome, or Firefox, but now we work only with Chromium which is used by default) over the DevTools Protocol in headless or non-headless mode.
To do this, in the directory with our project, open the command line and enter npm init -y
, and then npm i puppeteer puppeteer-extra puppeteer-extra-plugin-stealth
.
*If you don't have Node.js installed, you can download it from nodejs.org and follow the installation documentation.
📌Note: also, you can use puppeteer
without any extensions, but I strongly recommend using it with puppeteer-extra
with puppeteer-extra-plugin-stealth
to prevent website detection that you are using headless Chromium or that you are using web driver. You can check it on Chrome headless tests website. The screenshot below shows you a difference.
Process
In this case, we need to intercept all responses when loading the page and get all the required data. The Gif below illustrates that approach:
Code explanation
Declare puppeteer
to control Chromium browser from puppeteer-extra
library and StealthPlugin
to prevent website detection that you are using web driver from puppeteer-extra-plugin-stealth
library:
const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
Next, we "say" to puppeteer
use StealthPlugin
, write search queries and the search URL. I write two different search queries to show you all available results (some of them are available only for a single search query, but some are available only for multiple search queries):
const searchQueries = ["Mercedes"];
// const searchQueries = ["Mercedes", "BMW", "Audi"];
const URL = `https://trends.google.com/trends/explore?q=${encodeURI(searchQueries.join(","))}&hl=en`;
Next, write a function to control the browser, and get information:
async function getGoogleTrendsResults() {
...
}
In this function first we need to define browser
using puppeteer.launch({options})
method with current options
, such as headless: false
and args: ["--no-sandbox", "--disable-setuid-sandbox"]
.
These options mean that the browser runs in non-headless mode (with a visible window) and that we pass an array of arguments which allows the browser process to launch in the online IDE. Then we open a new page
:
const browser = await puppeteer.launch({
headless: false,
args: ["--no-sandbox", "--disable-setuid-sandbox"],
});
const page = await browser.newPage();
Next, we raise the default navigation timeout from 30 sec to 60000 ms (1 min) for slow internet connections with the .setDefaultNavigationTimeout()
method, go to URL
with .goto()
method and use .waitForTimeout()
method to wait 5 seconds:
await page.setDefaultNavigationTimeout(60000);
await page.goto(URL);
await page.waitForTimeout(5000);
Because google possibly blocks direct search requests we need to reload the page:
await page.reload();
Then, we define arrays and objects with the results and write valuePattern
to find the correct responses from all of them:
const interestOverTime = {};
const comparedByRegion = [];
const interestByRegion = [];
const relatedQueries = [];
const relatedTopics = {};
const valuePattern = /%22value%22:%22(?<value>[^%]+)/gm;
Next, we write the response intercept function:
page.on("response", async (response) => {
...
})
In this function first, we check that the response header content-type
contains the "application/"
part. If it does,
we define responseData
with the body of the current response and responseURL
with its URL:
if (response.headers()["content-type"]?.includes("application/")) {
const responseData = await response.text();
const responseURL = await response.url();
...
}
Next, we need to separate response with interestOverTime
data from all other:
if (responseURL.includes("widgetdata/multiline?")) {
...
} else {
...
}
If response with interestOverTime
data, we need to get the correct data to work with it. To do this we remove the first six chars (the five-character prefix )]}',
and the newline \n
) from responseData
(.slice(6)
method), parse received JSON string to JS object (JSON.parse()
method) and get data from default
key:
const parsedData = JSON.parse(responseData.slice(6))?.default;
Next, we build the timelineData
array from our parsed response data with objects containing date
and values
keys and the averages
array with objects containing query
and value
keys (to get the correct date
we need to decode the URI encoded string):
interestOverTime.timelineData = parsedData?.timelineData?.map((dataEl) => ({
date: decodeURI(dataEl.formattedTime),
values: searchQueries.map((queryEl, i) => ({
query: queryEl,
value: dataEl.formattedValue[i],
extractedValue: dataEl.value[i],
})),
}));
interestOverTime.averages = parsedData.averages.map((dataEl, i) => ({
query: searchQueries[i],
value: dataEl,
}));
Next, to get all other data we need to get values
from responseURL
using the matchAll
method and the spread syntax([...]
):
const values = [...responseURL.matchAll(valuePattern)].map(({ groups }) => groups.value);
Next, to get comparedByRegion
data we need a response in which values
array length is more than one:
if (values.length > 1) {
const parsedData = JSON.parse(responseData.slice(6))?.default;
comparedByRegion.push(
...parsedData.geoMapData.map((dataEl) => ({
geo: dataEl.geoCode,
location: dataEl.geoName,
maxValueIndex: dataEl.maxValueIndex,
values: searchQueries.map((queryEl, i) => ({
query: queryEl,
value: dataEl.formattedValue[i],
extractedValue: dataEl.value[i],
})),
}))
);
}
Otherwise, we can get interestByRegion
data. To do this, we need to iterate searchQueries
array with the for...of
loop. In the loop, we compare the single value values
with query
. If it's equal to each other, we can fill the interestByRegion
array by pushing results from parsedData
:
else {
for (query of searchQueries) {
if (values[0] === query) {
const parsedData = JSON.parse(responseData.slice(6))?.default;
interestByRegion.push({
query,
data: parsedData.geoMapData.map((dataEl) => ({
geo: dataEl.geoCode,
location: dataEl.geoName,
maxValueIndex: dataEl.maxValueIndex,
value: dataEl.formattedValue[0],
extractedValue: dataEl.value[0],
})),
});
}
}
}
Next, to get relatedTopics
and relatedQueries
data we need a response with the "widgetdata/relatedsearches?"
part in the responseURL
. If it's true, we need to iterate over searchQueries
and compare the single value values
with query
again:
for (query of searchQueries) {
if (values[0] === query) {
...
}
}
Next, we check if responseURL
contains "%22keywordType%22:%22ENTITY%22"
part, we can get relatedTopics
data:
if (responseURL.includes("%22keywordType%22:%22ENTITY%22")) {
const parsedData = JSON.parse(responseData.slice(6))?.default;
relatedTopics.top = parsedData.rankedList[0].rankedKeyword.map((dataEl) => ({
topic: {
title: dataEl.topic.title,
type: dataEl.topic.type,
},
value: dataEl.formattedValue,
extractedValue: dataEl.value,
link: "https://trends.google.com" + dataEl.link,
}));
relatedTopics.rising = parsedData.rankedList[1].rankedKeyword.map((dataEl) => ({
topic: {
title: dataEl.topic.title,
type: dataEl.topic.type,
},
value: dataEl.formattedValue,
extractedValue: dataEl.value,
link: "https://trends.google.com" + dataEl.link,
}));
}
Otherwise, we can get relatedQueries
data:
else {
const parsedData = JSON.parse(responseData.slice(6))?.default;
relatedQueries.push({
searchQuery: query,
top: parsedData.rankedList[0].rankedKeyword.map((dataEl) => ({
query: dataEl.query,
value: dataEl.formattedValue,
extractedValue: dataEl.value,
link: "https://trends.google.com" + dataEl.link,
})),
rising: parsedData.rankedList[1].rankedKeyword.map((dataEl) => ({
query: dataEl.query,
value: dataEl.formattedValue,
extractedValue: dataEl.value,
link: "https://trends.google.com" + dataEl.link,
})),
});
}
And finally, we wait 10 seconds (wait for all responses to be finished) and close the browser and return the received data:
await page.waitForTimeout(10000);
await browser.close();
return { interestOverTime, comparedByRegion, interestByRegion, relatedQueries, relatedTopics };
Now we can launch our parser:
$ node YOUR_FILE_NAME # YOUR_FILE_NAME is the name of your .js file
Output
📌Note: I've combined the results of different runs of our parser into one for convenience.
{
"interestOverTime":{
"timelineData":[
{
"date":"Aug 29 – Sep 4, 2021",
"values":[
{
"query":"Mercedes",
"value":"74",
"extractedValue":74
},
{
"query":"BMW",
"value":"100",
"extractedValue":100
},
{
"query":"Audi",
"value":"60",
"extractedValue":60
}
]
},
... and other dates
],
"averages":[
{
"query":"Mercedes",
"value":65
},
{
"query":"BMW",
"value":89
},
{
"query":"Audi",
"value":55
}
]
},
"comparedByRegion":[
{
"geo":"LT",
"location":"Lithuania",
"maxValueIndex":1,
"values":[
{
"query":"Mercedes",
"value":"13%",
"extractedValue":13
},
{
"query":"BMW",
"value":"52%",
"extractedValue":52
},
{
"query":"Audi",
"value":"35%",
"extractedValue":35
}
]
},
... and other regions
],
"interestByRegion":[
{
"query":"Mercedes",
"data":[
{
"geo":"GR",
"location":"Greece",
"maxValueIndex":0,
"value":"36",
"extractedValue":36
},
... and other regions
]
},
{
"query":"BMW",
"data":[
{
"geo":"GR",
"location":"Greece",
"maxValueIndex":0,
"value":"37",
"extractedValue":37
},
... and other regions
]
},
{
"query":"Audi",
"data":[
{
"geo":"XK",
"location":"Kosovo",
"maxValueIndex":0,
"value":"100",
"extractedValue":100
},
... and other regions
]
}
],
"relatedQueries":[
{
"searchQuery":"Audi",
"top":[
{
"query":"a4 audi",
"value":"100",
"extractedValue":100,
"link":"https://trends.google.com/trends/explore?q=a4+audi&date=today+12-m"
},
... and other queries
],
"rising":[
{
"query":"wheel of fortune contestant loses audi",
"value":"+2,800%",
"extractedValue":2800,
"link":"https://trends.google.com/trends/explore?q=wheel+of+fortune+contestant+loses+audi&date=today+12-m"
},
... and other queries
]
},
{
"searchQuery":"BMW",
"top":[
{
"query":"bmw price",
"value":"100",
"extractedValue":100,
"link":"https://trends.google.com/trends/explore?q=bmw+price&date=today+12-m"
},
... and other queries
],
"rising":[
{
"query":"bmw x6 2022",
"value":"+1,450%",
"extractedValue":1450,
"link":"https://trends.google.com/trends/explore?q=bmw+x6+2022&date=today+12-m"
},
... and other queries
]
},
{
"searchQuery":"Mercedes",
"top":[
{
"query":"mercedes benz",
"value":"100",
"extractedValue":100,
"link":"https://trends.google.com/trends/explore?q=mercedes+benz&date=today+12-m"
},
... and other queries
],
"rising":[
{
"query":"mercedes f1 2022",
"value":"+2,250%",
"extractedValue":2250,
"link":"https://trends.google.com/trends/explore?q=mercedes+f1+2022&date=today+12-m"
},
... and other queries
]
}
],
"relatedTopics":{
"top":[
{
"topic":{
"title":"Mercedes-Benz",
"type":"Luxury vehicles company"
},
"value":"100",
"extractedValue":100,
"link":"https://trends.google.com/trends/explore?q=/m/052mx&date=today+12-m"
},
...and other topics
],
"rising":[
{
"topic":{
"title":"Mercedes-Benz EQB",
"type":"SUV"
},
"value":"+700%",
"extractedValue":700,
"link":"https://trends.google.com/trends/explore?q=/g/11h__y1vw4&date=today+12-m"
},
...and other topics
]
}
}
Using Google Trends API
The difference is that you don't need to use browser automation to scrape the results, or write a parser from scratch and maintain it, which saves a lot of time.
With your own scraper there's also a chance that Google will block the requests at some point. With the API, you just need to iterate over the structured JSON and get the data you want.
First, we need to install google-search-results-nodejs
. To do this you need to enter in your console: npm i google-search-results-nodejs
Here's the full code example, if you don't need an explanation:
require("dotenv").config();
const SerpApi = require("google-search-results-nodejs");
const search = new SerpApi.GoogleSearch(process.env.API_KEY); //your API key from serpapi.com
// Keyword(s) to look up. A single term yields interestOverTime, interestByRegion,
// relatedQueries and relatedTopics.
const searchQueries = "Mercedes";
// Comma-separated alternative: several terms yield interestOverTime and comparedByRegion.
// const searchQueries = "Mercedes, BMW, Audi";

// `data_type` values requested when several terms are compared.
const dataTypesMultiple = [
  "TIMESERIES",
  "GEO_MAP",
];

// `data_type` values requested for a single term.
const dataTypesSingle = [
  "TIMESERIES",
  "GEO_MAP_0",
  "RELATED_TOPICS",
  "RELATED_QUERIES",
];

// Request parameters shared by every call; `data_type` is added per request.
const params = {
  engine: "google_trends", // search engine
  q: searchQueries, // search query
};
/**
 * Promise wrapper around the callback-based `search.json()`.
 * Sends the current module-level `params` and resolves with whatever the
 * library hands to its callback.
 */
const getJson = () =>
  new Promise((resolve) => {
    const onResult = (json) => resolve(json);
    search.json(params, onResult);
  });
/**
 * Requests every relevant Google Trends data type for `searchQueries` and
 * gathers the interesting part of each response in one object.
 *
 * `dataTypesMultiple` is used when `searchQueries` contains more than one
 * comma-separated term, `dataTypesSingle` otherwise. The requests run one after
 * another because `getJson()` reads the shared `params` object, whose
 * `data_type` is set anew for every request.
 *
 * @returns {Promise<object>} Object with one key per requested data type
 *   (`interestOverTime`, `comparedByRegion`, `interestByRegion`,
 *   `relatedTopics`, `relatedQueries`).
 */
const getResults = async () => {
  // data_type -> [key in the returned object, field of the API response]
  const fieldsByDataType = {
    TIMESERIES: ["interestOverTime", "interest_over_time"],
    GEO_MAP: ["comparedByRegion", "compared_breakdown_by_region"],
    GEO_MAP_0: ["interestByRegion", "interest_by_region"],
    RELATED_TOPICS: ["relatedTopics", "related_topics"],
    RELATED_QUERIES: ["relatedQueries", "related_queries"],
  };

  const hasMultipleQueries = searchQueries.split(",").length > 1;
  const dataTypes = hasMultipleQueries ? dataTypesMultiple : dataTypesSingle;

  const trendsResults = {};
  // `const` keeps the loop variable local; the undeclared variable used before
  // throws a ReferenceError in strict mode and in ES modules.
  for (const type of dataTypes) {
    params.data_type = type;
    const searchResult = await getJson();
    const fields = fieldsByDataType[type];
    if (!fields) continue; // unknown data type: nothing to extract
    const [resultKey, responseField] = fields;
    trendsResults[resultKey] = searchResult[responseField];
  }
  return trendsResults;
};
// Run the requests and print the complete result tree; report failures instead of
// leaving an unhandled promise rejection.
getResults()
  .then((result) => console.dir(result, { depth: null }))
  .catch((error) => {
    console.error("Failed to get Google Trends results:", error);
    process.exitCode = 1;
  });
Code explanation
First, we need to declare SerpApi
from google-search-results-nodejs
library and define new search
instance with your API key from SerpApi:
const SerpApi = require("google-search-results-nodejs");
const search = new SerpApi.GoogleSearch(API_KEY);
Next, we write two different search queries to show you all available results (some of them are available only for a single search query, but some are available only for multiple search queries) and the necessary parameters for making a request :
const searchQueries = "Mercedes";
// const searchQueries = "Mercedes, BMW, Audi";
const dataTypesMultiple = ["TIMESERIES", "GEO_MAP"];
const dataTypesSingle = ["TIMESERIES", "GEO_MAP_0", "RELATED_TOPICS", "RELATED_QUERIES"];
const params = {
engine: "google_trends",
q: searchQueries,
};
Next, we wrap the search method from the SerpApi library in a promise to further work with the search results:
const getJson = () => {
return new Promise((resolve) => {
search.json(params, resolve);
});
};
And finally, we declare the function getResults
that gets data from the page and return it:
const getResults = async () => {
...
};
In this function first, we declare an object trendsResults
with results data:
const trendsResults = {};
Next, we need to define if searchQueries
has one or more values. If it has more than one, we can get interestOverTime
and comparedByRegion
data. To do this, we need to iterate over dataTypesMultiple
array (with the for...of
loop) and set data_type
value in the params object. Then we just receive searchResult
and select the necessary data:
if (searchQueries.split(",").length > 1) {
for (type of dataTypesMultiple) {
params.data_type = type;
const searchResult = await getJson();
if (type === "TIMESERIES") trendsResults.interestOverTime = searchResult.interest_over_time;
else if (type === "GEO_MAP") trendsResults.comparedByRegion = searchResult.compared_breakdown_by_region;
}
}
Otherwise, we iterate over dataTypesSingle
array and get all data from the single query search request:
else {
for (type of dataTypesSingle) {
params.data_type = type;
const searchResult = await getJson();
if (type === "TIMESERIES") trendsResults.interestOverTime = searchResult.interest_over_time;
else if (type === "GEO_MAP_0") trendsResults.interestByRegion = searchResult.interest_by_region;
else if (type === "RELATED_TOPICS") trendsResults.relatedTopics = searchResult.related_topics;
else if (type === "RELATED_QUERIES") trendsResults.relatedQueries = searchResult.related_queries;
}
}
After, we run the getResults
function and print all the received information in the console with the console.dir
method, which allows you to use an object with the necessary parameters to change default output options. See the Node.js documentation for more info:
getResults().then((result) => console.dir(result, { depth: null }));
Output
📌Note: I've combined the results of different runs of our parser into one for convenience.
{
"interestOverTime":{
"timeline_data":[
{
"date":"Aug 29 – Sep 4, 2021",
"values":[
{
"query":"Mercedes",
"value":"74",
"extracted_value":74
},
{
"query":"BMW",
"value":"100",
"extracted_value":100
},
{
"query":"Audi",
"value":"60",
"extracted_value":60
}
]
},
... and other dates
],
"averages":[
{
"query":"Mercedes",
"value":66
},
{
"query":"BMW",
"value":90
},
{
"query":"Audi",
"value":55
}
]
},
"comparedByRegion":[
{
"geo":"LT",
"location":"Lithuania",
"max_value_index":1,
"values":[
{
"query":"Mercedes",
"value":"13%",
"extracted_value":13
},
{
"query":"BMW",
"value":"52%",
"extracted_value":52
},
{
"query":"Audi",
"value":"35%",
"extracted_value":35
}
]
},
...and other regions
],
"interestByRegion":[
{
"geo":"AL",
"location":"Albania",
"max_value_index":0,
"value":"100",
"extracted_value":100
},
...and other regions
],
"relatedTopics":{
"rising":[
{
"topic":{
"value":"/g/11h__y1vw4",
"title":"Mercedes-Benz EQB",
"type":"SUV"
},
"value":"+700%",
"extracted_value":700,
"link":"https://trends.google.com/trends/explore?q=/g/11h__y1vw4&date=today+12-m",
"serpapi_link":"https://serpapi.com/search.json?data_type=RELATED_TOPICS&date=today+12-m&engine=google_trends&q=%2Fg%2F11h__y1vw4&tz=420"
},
... and other topics
],
"top":[
{
"topic":{
"value":"/m/052mx",
"title":"Mercedes-Benz",
"type":"Luxury vehicles company"
},
"value":"100",
"extracted_value":100,
"link":"https://trends.google.com/trends/explore?q=/m/052mx&date=today+12-m",
"serpapi_link":"https://serpapi.com/search.json?data_type=RELATED_TOPICS&date=today+12-m&engine=google_trends&q=%2Fm%2F052mx&tz=420"
},
... and other topics
]
},
"relatedQueries":{
"rising":[
{
"query":"mercedes eqxx",
"value":"+2,450%",
"extracted_value":2450,
"link":"https://trends.google.com/trends/explore?q=mercedes+eqxx&date=today+12-m",
"serpapi_link":"https://serpapi.com/search.json?data_type=RELATED_QUERIES&date=today+12-m&engine=google_trends&q=mercedes+eqxx&tz=420"
},
... and other queries
],
"top":[
{
"query":"mercedes benz",
"value":"100",
"extracted_value":100,
"link":"https://trends.google.com/trends/explore?q=mercedes+benz&date=today+12-m",
"serpapi_link":"https://serpapi.com/search.json?data_type=RELATED_QUERIES&date=today+12-m&engine=google_trends&q=mercedes+benz&tz=420"
},
... and other queries
]
}
}
Links
If you want to see some projects made with SerpApi, please write me a message.
Add a Feature Request💫 or a Bug🐞
Posted on September 4, 2022
Join Our Newsletter. No Spam, Only the good stuff.
Sign up to receive the latest update from our blog.