How to efficiently store JSON file objects into MongoDB using Node.js streams?

Considering that the file size can cross a GB, here is what I have done.

Create a large JSON file using the following code (which outputs a ~198 MB file):


const fs = require('fs');

const arrayOfUsers = [];

for (let i = 0; i < 1e6; i += 1) {
  arrayOfUsers.push({
    firstName: 'Barb',
    lastName: 'E. Riser',
    status: 'unregistered',
    updatedOn: '2017-01-17T13:24:51.403Z',
    createdOn: '2017-01-17T13:24:51.403Z',
    googleLocation: {
      loc: {
        coordinates: [null, null],
      },
    },
  });
}

// write to file
fs.writeFile(`${__dirname}/largeUsers.json`, JSON.stringify(arrayOfUsers), (err) => {
  if (err) {
    console.log('Error occurred, exiting...');
    process.exit(-1);
  }

  console.log('Write successful, exiting...');
  process.exit(0);
});
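
Since the stated goal is a file that can cross a GB, building the whole array in memory and stringifying it in one go will eventually hit the Node.js heap limit. Below is a minimal sketch of a streaming alternative, using the same record shape and count as above; for brevity it ignores write() backpressure, which a real generator should respect via the 'drain' event.

const fs = require('fs');

const totalRecords = 1e6; // assumption: same count as above
const out = fs.createWriteStream(`${__dirname}/largeUsers.json`);

out.write('[');
for (let i = 0; i < totalRecords; i += 1) {
  const user = {
    firstName: 'Barb',
    lastName: 'E. Riser',
    status: 'unregistered',
    updatedOn: '2017-01-17T13:24:51.403Z',
    createdOn: '2017-01-17T13:24:51.403Z',
    googleLocation: { loc: { coordinates: [null, null] } },
  };
  // Comma-separate the records; no trailing comma after the last one
  out.write((i === 0 ? '' : ',') + JSON.stringify(user));
}
out.write(']');
out.end(() => console.log('Write successful, exiting...'));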

Now I am using this same file to store its contents in MongoDB:

const fs = require('fs');
const mongoose = require('mongoose');
const JSONStream = require('JSONStream');
const User = require('./models/User');

const startTime = Date.now();

const databaseURL = 'mongodb://127.0.0.1/dextra';
mongoose.connect(databaseURL);
mongoose.Promise = global.Promise;

const db = mongoose.connection;

db.on('open', () => {
  console.log('Connected to mongo server.\nImport from file to DB started...');
  const dataStreamFromFile = fs.createReadStream(`${__dirname}/largeUsers.json`);

  dataStreamFromFile.pipe(JSONStream.parse('*')).on('data', (chunk) => {
    // fire-and-forget: nothing waits for these saves to finish
    new User(chunk).save();
  });

  dataStreamFromFile.on('end', () => {
    // note: the file stream can end while many of the saves above are still pending
    const timeTaken = Date.now() - startTime;
    console.log(`Import completed in ${timeTaken} milliseconds, closing connection...`);
    db.close();
    process.exit(0);
  });
});

db.on('error', (err) => {
  console.error('MongoDB connection error:', err);
  process.exit(-1);
});

The issues I am facing are as follows:

  • If I am not using a transform stream, a raw file chunk is not guaranteed to contain complete objects (let alone the whole array)
  • JSONStream parses every object in the file one by one, and it is taking a long time
  • How should I manage MongoDB connections using mongoose?
  • How can I decrease the latency?
  • How can I build a batch from the parsed chunks (perhaps using another transform stream) and then insert that batch of, say, 500 or 1000 objects at a time? (A sketch follows below.)
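
One way to approach the batching question (and avoid the unawaited save() calls) is to put a small object-mode Transform between JSONStream and the database, and to do the inserts in a Writable so the stream machinery handles backpressure. This is only a sketch under assumptions: the batch size of 1000 is arbitrary, and the model path and connection URI are taken from the code above.

const fs = require('fs');
const { Transform, Writable } = require('stream');
const mongoose = require('mongoose');
const JSONStream = require('JSONStream');
const User = require('./models/User');

const BATCH_SIZE = 1000; // assumption: tune as needed

// Groups parsed user objects into arrays of BATCH_SIZE
const batcher = new Transform({
  objectMode: true,
  transform(user, _enc, callback) {
    this.batch = this.batch || [];
    this.batch.push(user);
    if (this.batch.length >= BATCH_SIZE) {
      this.push(this.batch);
      this.batch = [];
    }
    callback();
  },
  flush(callback) {
    // Emit whatever is left over when the input ends
    if (this.batch && this.batch.length) this.push(this.batch);
    callback();
  },
});

// Inserts one batch at a time; the next batch is not written until the
// callback fires, so inserts are not fired off in parallel
const dbWriter = new Writable({
  objectMode: true,
  write(batch, _enc, callback) {
    User.insertMany(batch)
      .then(() => callback())
      .catch(callback);
  },
});

mongoose.connect('mongodb://127.0.0.1/dextra').then(() => {
  fs.createReadStream(`${__dirname}/largeUsers.json`)
    .pipe(JSONStream.parse('*'))
    .pipe(batcher)
    .pipe(dbWriter)
    .on('finish', () => {
      console.log('Import complete, closing connection...');
      mongoose.connection.close();
    })
    .on('error', (err) => {
      console.error(err);
      process.exit(1);
    });
});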

Comments (4)

Ashok Dey:

So I came up with a very simple code snippet, which is as follows:

const fs = require('fs');
const mongoose = require('mongoose');
const JSONStream = require('JSONStream');
const User = require('./src/models/User');
const config = require('./config.json');

mongoose.connect(config.MONGODB_URI, { poolSize: config.DB_POOL_SIZE });
mongoose.Promise = global.Promise;

const db = mongoose.connection;
let arrayOfUsers = [];

db.on('open', () => {
  console.log('Connected to mongo server.\n');
  process.stdout.write('Processing.');
  const dataStreamFromFile = fs.createReadStream(`${__dirname}/users_large.json`);
  dataStreamFromFile.pipe(JSONStream.parse('*')).on('data', async (userData) => {
    arrayOfUsers.push(userData);
    if (arrayOfUsers.length === config.BATCH_INSERT_VALUE) {
      // Pause the source stream while the batch is being written (manual backpressure)
      dataStreamFromFile.pause();
      await User.insertMany(arrayOfUsers);
      arrayOfUsers = [];
      process.stdout.write('.');
      dataStreamFromFile.resume();
    }
  });

  dataStreamFromFile.on('end', async () => {
    if (arrayOfUsers.length) {
      await User.insertMany(arrayOfUsers); // insert the leftover partial batch
    }
    console.log('\nImport complete, closing connection...');
    db.close();
    process.exit(0);
  });
});

db.on('error', (err) => {
  console.error('MongoDB connection error: ', err);
  process.exit(-1);
});

Note: Inserting 50k records per batch is also a bit slow. Will keep on trying!
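
One knob that might help with the slow batches (untested here) is insertMany's ordered option: with ordered: false, MongoDB does not stop at the first failing document and does not have to preserve insertion order, which is sometimes faster for large batches. A hedged drop-in for the insertMany calls above:

const User = require('./src/models/User'); // same model as in the snippet above

// Untested sketch: unordered bulk insert; skips the call entirely for an empty batch
function insertBatch(batch) {
  if (!batch.length) return Promise.resolve();
  return User.insertMany(batch, { ordered: false });
}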

Ashok Dey:

Thanks! I changed the value of BATCH_INSERT_VALUE from 50000 to 1000 and it was faster. I just read that the limit of insertMany() is 100.

raj rawat:

Hi, could you please share the git repo for the running code, Ashok Dey?