Cleaning up the data sets

The imported data is too cumbersome:

db.sales.findOne()
{
  "_id": ObjectId("56005dd980c3678b19792b7f"),
  "paon": {
    "datatype": "http://www.w3.org/2001/XMLSchema#string",
    "type": "typed-literal",
    "value": "25"
  },
  "street": {
    "datatype": "http://www.w3.org/2001/XMLSchema#string",
    "type": "typed-literal",
    "value": "NORFOLK PARK COTTAGES"
  },
  "town": {
    "datatype": "http://www.w3.org/2001/XMLSchema#string",
    "type": "typed-literal",
    "value": "MAIDENHEAD"
  },
  "county": {
    "datatype": "http://www.w3.org/2001/XMLSchema#string",
    "type": "typed-literal",
    "value": "WINDSOR AND MAIDENHEAD"
  },
  "postcode": {
    "datatype": "http://www.w3.org/2001/XMLSchema#string",
    "type": "typed-literal",
    "value": "SL6 7DR"
  },
  "amount": {
    "datatype": "http://www.w3.org/2001/XMLSchema#integer",
    "type": "typed-literal",
    "value": "9000"
  },
  "date": {
    "datatype": "http://www.w3.org/2001/XMLSchema#date",
    "type": "typed-literal",
    "value": "1996-09-19"
  }
}

A simple $project aggregation refactors the data so that it's simpler to use:

db.sales.aggregate([
  {$project: {
    "address.nameOrNumber": "$paon.value",
    "address.street": "$street.value",
    "address.town": "$town.value",
    "address.county": "$county.value",
    "address.postcode": "$postcode.value",
    amount: "$amount.value",
    date: "$date.value"
  }},
  {$out: "homeSales"}
])

The data is now much cleaner:

db.homeSales.findOne()
{
  "_id": ObjectId("56005dd980c3678b19792b7f"),
  "amount": "9000",
  "date": "1996-09-19",
  "address": {
    "nameOrNumber": "25",
    "street": "NORFOLK PARK COTTAGES",
    "town": "MAIDENHEAD",
    "county": "WINDSOR AND MAIDENHEAD",
    "postcode": "SL6 7DR"
  }
}

But we will want to analyze the amount and date data and they're currently stored as strings. To fix that, run the following commands to covert them to numbers and dates respectively:

db.homeSales.find({amount: {$exists: true}}).forEach(function(obj) { 
    obj.amount = new NumberInt(obj.amount);
    db.homeSales.save(obj);
});

db.homeSales.find({date: {$exists: true}}).forEach(function(obj) { 
    obj.date = new Date(obj.date);
    db.homeSales.save(obj);
});

The nameOrNumber key is a little trickier as it is sometimes a numeric value within the string and sometimes a house name – the following code converts it to a number in every document where the string contains a number:

db.homeSales.find({"address.nameOrNumber": {$exists: true}}).forEach(function(obj) { 
    obj.address.nameOrNumber = new NumberInt(obj.address.nameOrNumber);
    if (obj.address.nameOrNumber != 0) db.homeSales.save(obj);
});

There's a second collection which we'd like to use which maps between postal codes and locations:

db.postcodes.findOne()
{
  "_id": ObjectId("5600521e50fa77da54dfc0d2"),
  "postcode": "SL6  0AA",
  "location": {
    "type": "Point",
    "coordinates": [
      51.525605,
      -0.700974
    ]
  }
}

Close examination of the data reveals that there are 2 spaces between the first and second components of the postcode value which would cause problems when trying to perform a join between these two collections. Again, this is simple to fix with the mongo shell:

db.postcodes.find({postcode: {$exists: true}}).forEach(function(obj) { 
    obj.postcode = (obj.postcode.slice(0, 3) + 
        obj.postcode.slice(4, obj.postcode.length));
    db.postcodes.save(obj);
});

MongoDB Compass can be used to check that we now have the collections in good shape as shown in Figure X.

Visual inspection of collection using MongoDB Compass Figure X: Visual inspection of collection using MongoDB Compass