In my previous post I talked about finding a sturdy scraping framework. In this post I'll be describing how to deploy Selenium Grid and Xvfb on a Linux instance.

For those of you who missed it, here's the link to the script.

Usage

// Boot: load the script ('/path/to/script' is a placeholder for its location)
// and start everything up. The callback receives an error if startup failed.
var Selenium = require('/path/to/script')
Selenium.boot(function (err) {
  if (err) console.log('Selenium bootup err:', err);
  else console.log('Selenium started!');
});

// Kill: terminate the Selenium hub, the Selenium nodes, and Xvfb.
Selenium.kill();

Dependencies

// Used to step through the commands required to start up Selenium and Xvfb.
var async = require('async');
// Programmatically installs and launches a Selenium standalone server and Chrome driver.
var selenium = require('selenium-standalone');
// Starts and stops an X virtual framebuffer; the Xvfb binary itself must be
// runnable from the command line on this machine.
var Xvfb = require('xvfb');
// Virtual screen that the Chrome instances open in. The 2880x1800 size leaves
// room for large screenshots while debugging.
var xvfb = new Xvfb({
  displayNum: 99,
  reuse: true,
  xvfb_args: ['-screen', '0', '2880x1800x24']
});
  1. async - used to step through the commands required to start up Selenium and Xvfb.
  2. selenium-standalone - used to programmatically install and launch a Selenium standalone server and Chrome driver.
  3. xvfb - used to start and stop an X virtual framebuffer. Make sure you can run Xvfb from the command line on your Linux machine; if not, you'll need to install it for your operating system.

Xvfb

First we need a virtual screen for our Chrome instances to open in. I created mine at 2880x1800 pixels so that I could snap large screenshots while debugging.

// Virtual screen for the Chrome instances to open in.
var xvfb = new Xvfb({
  displayNum: 99,
  reuse: true,
  // 2880x1800 pixels so that large screenshots can be taken while debugging.
  xvfb_args: ['-screen', '0', '2880x1800x24']
});

Then start the virtual screen as the first step to this whole process.

exports.boot = function (finished) {
  ...
  // First step of the boot sequence: start the virtual screen.
  function (done) {
    xvfb.start(function(err){
      // Hand any start error (or nothing on success) to the step callback.
      return done(err);
    });
  }
  ...
}

Selenium

After we have our virtual screen available, we need to start up Selenium. The selenium-standalone package installs both Selenium and Chrome Driver to your machine. I also create a selenium.children array that is used to store all the child processes of the Selenium Grid so we can kill them later.

exports.boot = function (finished) {
  ...
  // Install step: selenium-standalone installs Selenium and Chrome Driver.
  function (done) {
    // Holds every child process of the Selenium Grid so they can be killed later.
    selenium.children = [];
    selenium.install({
      // Forward installer progress messages to the console.
      logger: function (message) { 
        console.log('Selenium install:', message);
      }
    }, function(err){
      return done(err);
    });
  }
  ...
}

Great, we have Selenium installed on our machine. Now let's create a Selenium Hub.

exports.boot = function (finished) {
  ...
  // Hub step: the '-role hub' arguments tell startSelenium to boot the
  // Selenium Hub rather than a node.
  function (done) {
    startSelenium(selenium, ['-role', 'hub', '-timeout', '100'], done);
  }
  ...
}

I created a helper startSelenium function to boot the Selenium Hub and Selenium Nodes. The function knows which to boot via the command line args passed in. After booting is complete, I push the process into the selenium.children array to be killed later.

/**
 * Boots one Selenium process (hub or node, depending on the arguments) and
 * records it so it can be killed later.
 *
 * @param {Object} selenium - object exposing `start(options, callback)` and a
 *   `children` array that collects the spawned processes.
 * @param {string[]} args - command line arguments handed to Selenium.
 * @param {Function} started - node-style callback: `started(err)` on failure,
 *   `started(null, selenium)` once the process is running.
 */
function startSelenium(selenium, args, started) {
  var options = { seleniumArgs: args };

  function onStarted(err, child) {
    if (err) {
      return started(err);
    }

    // Mirror whatever the process writes to stderr onto the console.
    child.stderr.on('data', function (chunk) {
      console.log(chunk.toString());
    });

    // Remember the process so it can be terminated later.
    selenium.children.push(child);

    return started(null, selenium);
  }

  selenium.start(options, onStarted);
}

Finally, now that we have a Selenium Hub, we need to create some Selenium Nodes to run our scraping jobs.

exports.boot = function (finished) {
  ...
  function (selenium, done) {

    // Ideal amount of Selenium Grid nodes on an Amazon m3.large instance
    var maxNodes = 6;

    async.times(maxNodes, function(n, next){

      var args = ['-role', 'node', '-hub', 'http://localhost:4444/grid/register', '-maxSession', '1'];
      var port = '555' + n;
      args.push('-port');
      args.push(port);

      startSelenium(selenium, args, next);

    }, function(){
      return done(null, selenium);
    });

  }
  ...
}

The maxNodes variable specifies how many Selenium nodes to create. I found six to be the ideal number for an m3.large Amazon instance, as described in my next post. The -maxSession argument specifies that only one job can run at a time per node. I found it better to have multiple nodes running one job each, instead of fewer nodes with more concurrent jobs each. Selenium does not work well when running concurrent tests per node; I ran into lots of memory leak issues.

Termination

To terminate our Selenium process, we need to kill the hub, nodes, and Xvfb.

/**
 * Terminates everything the boot sequence started: each recorded Selenium
 * child process (hub and nodes) first, then the Xvfb virtual screen.
 * Either part is skipped when its handle is not set.
 */
function kill() {

  var children = selenium && selenium.children;

  if (children) {
    children.forEach(function (child) {
      console.log('Killing selenium with PID:', child.pid);
      child.kill();
    });
  }

  if (!xvfb) {
    return;
  }

  console.log('Killing xvfb');
  xvfb.stopSync();

}

In addition, whenever the parent node process stops for whatever reason, we want to kill the Selenium process. If this is not done, then Selenium and Xvfb have to be manually terminated via the command line before you can restart your process.

// Clean up Selenium and Xvfb whenever the parent process is asked to stop.
// Registering a listener for SIGINT/SIGTERM removes Node's default behaviour
// of exiting on these signals, so exit explicitly once cleanup is done;
// otherwise the parent process could keep running after the signal.
process.on('SIGINT', function(){
  kill();
  process.exit(130); // 128 + SIGINT (2)
});

process.on('SIGTERM', function(){
  kill();
  process.exit(143); // 128 + SIGTERM (15)
});

There you have it. This script should work locally and in production environments to boot up a Selenium Grid. Just make sure you can run Xvfb from your command line, and all should be well. Next up, I'll describe my experience of creating and running multiple concurrent scraping jobs.

Write your comment…

Be the first one to comment