Learn Node Part 9: WebScraper

Overview

Initially I wanted to scrape a school site for class schedules using the Node modules cheerio and request. It turns out that the site I wanted to scrape serves its data inside an iframe… and request doesn’t handle those: it only downloads the outer page’s HTML and never loads the frame’s content. So I decided to switch to Selenium and grunt-mocha-webdriver, which drive a real browser instead. You can find more information here.

Installation

Step 1

1
2
3
4
5
6
### Command Line ###

# Install grunt, the task loader and the mocha/webdriver plugin as dev dependencies
npm install --save-dev grunt load-grunt-tasks grunt-mocha-webdriver

Step 2

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
// ### in Gruntfile.js ###

module.exports = function(grunt) {

  require('load-grunt-tasks')(grunt);
  grunt.initConfig({
    mochaWebdriver: {
      options: {
        timeout: 1000 * 60 * 3,
        reporter: 'spec'
      },
      chrome: {
        src: ['scrape/index.js'],
        options: {
          testName: 'scrapping',
          hostname: 'localhost',
          port:   '4444',
          usePromises: true,
          autoInstall: true,
          browsers: [
            {browserName: 'chrome'}
          ]
        }
      }
    }// end mochaWebdriver
  });

  grunt.registerTask('default', ["mochaWebdriver:chrome"]);
};

Step 3

1
2
3
4
5
6
7
8
// ### in scrape/index.js ###

// Minimal mocha suite: a single spec whose body is still a placeholder.
describe("Scraping", function scrapingSuite() {
  var specTitle = "should just go to the url and wait";

  it(specTitle, function goToUrlAndWait() {
    // instructions go here
  });
});

Step 4

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
// ### in scrape/index.js ###

// JavaScript source, kept as a string, that is handed to `.execute()` below.
var jsCode = "console.log('this will execute in the browser')";

// Placeholder address and CSS selectors for the page being scraped;
// replace them with the real values for the target site.
var page = {
  url: "url",
  body: "#body-id",
  submit: "#submit-btn-id"
};

/**
 * Opens the page, waits until the body element is displayed, runs `jsCode`,
 * clicks the submit button and then pauses for four seconds.
 *
 * Relies on `this.browser` and `this.wd` being present on the mocha context
 * (presumably injected by grunt-mocha-webdriver — confirm against its README).
 */
it("should just go to the url", function() {
  // Return the chain so mocha waits for every step to finish and reports a
  // rejected step as a test failure. Without the `return`, the spec would
  // complete synchronously before the browser has done anything.
  return this.browser
      .get(page.url)
      .waitForElementByCss(page.body, this.wd.asserters.isDisplayed)
      .execute(jsCode)
      .elementByCss(page.submit)
      .click()
      .sleep(4000);
});

Learn Node Part 9: WebScraper

Overview

Installation

Random Posts