I had a situation this week, I wanted to read and validate some data from a private Google Spreadsheet using Puppeteer. Initially, I found 2 problems:
- I needed to log in with a custom email/password to have the access to the spreadsheet. A captcha appears if we use a "vanilla" implementation of the puppeteer app.
- The spreadsheet was blocked, we have read-only permission. We can't click, read, modify or make any operation on the cells.
This seems pretty awful, don't you think? Well, let's solve the first topic.
This is how I could bypass the Gmail captcha login and could read the data like a charm:
The Tools
We choose to use 3 extra packages aside puppeteer:
So my package.json looked like this.
{
  "name": "spreadsheet-checker",
  "version": "1.0.0",
  "description": "an google spreadsheet reader",
  "main": "index.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "keywords": [],
  "author": "",
  "license": "MIT",
  "dependencies": {
    "puppeteer": "^8.0.0",
    "puppeteer-extra": "^3.1.18",
    "puppeteer-extra-plugin-adblocker": "^2.11.11",
    "puppeteer-extra-plugin-stealth": "^2.7.6"
  }
}
The Script
To get access to the spreadsheet we need to login first, and then make the redirect to the spreadsheet. So the script will be like this:
const puppeteer = require("puppeteer-extra");
// Add stealth plugin and use defaults (all tricks to hide puppeteer usage)
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
puppeteer.use(StealthPlugin());
// Add adblocker plugin to block all ads and trackers (saves bandwidth)
const AdblockerPlugin = require("puppeteer-extra-plugin-adblocker");
puppeteer.use(AdblockerPlugin({ blockTrackers: true }));
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}
(async function () {
  // That's it, the rest is puppeteer usage as normal 
  const browser = await puppeteer.launch({
    headless: false
  });
  const page = await browser.newPage();
  let navigationPromise = page.waitForNavigation();
  await page.goto("https://accounts.google.com/");
  await navigationPromise;
  await page.waitForSelector('input[type="email"]');
  await page.type('input[type="email"]', process.env.email); // Email login
  await page.click("#identifierNext");
  await page.waitForSelector('input[type="password"]', { visible: true });
  await page.type('input[type="password"]', process.env.password); // Password login
  await page.waitForSelector("#passwordNext", { visible: true });
  await page.click("#passwordNext");
  navigationPromise = page.waitForNavigation();
  await navigationPromise;
  await page.goto(process.env.file_url); // Spreadsheet url
  await page.screenshot({ path: "spreadsheet-screen.png", fullPage: true }); // We take a screenshot to have probe of the bypass
  await browser.close();
})();
Now, let's solve the second topic.
The capture
So now we are in, how we can read the data? Well, the best approach with this scenario (read-only spreadsheet) we can download the data by using 2 things:
- Setting the download folder handler for the puppeteer app.
- Using page.keyboard.downandpage.keyboard.pressto trigger the shortcuts to save the file in the format we want it (PDF, CSV, XLSX).
The download handler
We need to bind a local folder to be the download folder for the puppeteer. To do this, we need to import the path package and configure a downloadPath, and then bind the page._client.send("Page.setDownloadBehavior") with a custom configuration.
const path = require("path");
const downloadPath = path.resolve("./download");
// puppeteer-extra is a drop-in replacement for puppeteer,
// it augments the installed puppeteer with plugin functionality.
// Any number of plugins can be added through `puppeteer.use()`
const puppeteer = require("puppeteer-extra");
// Add stealth plugin and use defaults (all tricks to hide puppeteer usage)
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
puppeteer.use(StealthPlugin());
// Add adblocker plugin to block all ads and trackers (saves bandwidth)
const AdblockerPlugin = require("puppeteer-extra-plugin-adblocker");
puppeteer.use(AdblockerPlugin({ blockTrackers: true }));
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}
(async function () {
  // That's it, the rest is puppeteer usage as normal 
  const browser = await puppeteer.launch({
    headless: false
  });
  const page = await browser.newPage();
  let navigationPromise = page.waitForNavigation();
  await page.goto("https://accounts.google.com/");
  await navigationPromise;
  await page.waitForSelector('input[type="email"]');
  await page.type('input[type="email"]', process.env.email); // Email login
  await page.click("#identifierNext");
  await page.waitForSelector('input[type="password"]', { visible: true });
  await page.type('input[type="password"]', process.env.password); // Password login
  await page.waitForSelector("#passwordNext", { visible: true });
  await page.click("#passwordNext");
  navigationPromise = page.waitForNavigation();
  await navigationPromise;
  await page.goto(process.env.file_url); // Spreadsheet url
  // Our download configuration
  await page._client.send("Page.setDownloadBehavior", {
    behavior: "allow",
    downloadPath: downloadPath,
  });
  await browser.close();
})();
With this, we are ready to make the download action via shortcuts.
The shortcuts
In this case, I downloaded all the pages via HTML, using the next shortcuts.
ALT + F to open the File tab.
ALT + D to open the Download menu.
ALT + W to select Website option, and donwload all the content as HTML.
The script updated:
const path = require("path");
const downloadPath = path.resolve("./download");
// puppeteer-extra is a drop-in replacement for puppeteer,
// it augments the installed puppeteer with plugin functionality.
// Any number of plugins can be added through `puppeteer.use()`
const puppeteer = require("puppeteer-extra");
// Add stealth plugin and use defaults (all tricks to hide puppeteer usage)
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
puppeteer.use(StealthPlugin());
// Add adblocker plugin to block all ads and trackers (saves bandwidth)
const AdblockerPlugin = require("puppeteer-extra-plugin-adblocker");
puppeteer.use(AdblockerPlugin({ blockTrackers: true }));
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}
(async function () {
  // That's it, the rest is puppeteer usage as normal 
  const browser = await puppeteer.launch({
    headless: false
  });
  const page = await browser.newPage();
  let navigationPromise = page.waitForNavigation();
  await page.goto("https://accounts.google.com/");
  await navigationPromise;
  await page.waitForSelector('input[type="email"]');
  await page.type('input[type="email"]', process.env.email); // Email login
  await page.click("#identifierNext");
  await page.waitForSelector('input[type="password"]', { visible: true });
  await page.type('input[type="password"]', process.env.password); // Password login
  await page.waitForSelector("#passwordNext", { visible: true });
  await page.click("#passwordNext");
  navigationPromise = page.waitForNavigation();
  await navigationPromise;
  await page.goto(process.env.file_url); // Spreadsheet url
  await page._client.send("Page.setDownloadBehavior", {
    behavior: "allow",
    downloadPath: downloadPath,
  });
  await page.keyboard.down("Alt");
  await page.keyboard.press("KeyF");
  await page.keyboard.press("KeyD");
  await page.keyboard.press("KeyW");
  await browser.close();
})();
Now we have the data downloaded. Cool! The reading process will be for another post.
Wrap up
This is a simple but useful implementation to solve this kind of problem. Hope you enjoy it.
Happy Hacking!





very nice !!!