Creating a Document-Store Hybrid in Postgres 9.5

February 04, 2016Nathan Fritz

Document-Stores, Key-Stores, and Relational databases all have different use cases that they work best for, but there is quite a bit of overlap. What if we were to have the advantages of all of these databases in one hybrid database?

Postgresql can store and query JSON, and now has a PUT/UPSERT equivalent with INSERT ... ON CONFLICT ... DO UPDATE so we can easily use Postgresql as a Document-Store like CouchDB as well as being able to take advantage of normalized tables with the same data.

In this blog post, I'll walk you through some postgres wizardry to treat 3rd party data in postgresql like a document-store and show you how to normalize parts of that data automatically.

Getting the Data

NASA has some pretty interesting data in their APIs like information about known asteroids near Earth, but I chose the boring API of Patents because it had some data to normalize, the "innovators" of a patent.

curl -o data.json "https://api.nasa.gov/patents/content?limit=10000&api_key=DEMO_KEY"

If that's not working, get an API key from NASA's API site.

At the top of the JSON file, I currently see a count value of 1324.

{"count": 1324, "results": [
  {
    "category": "materials and coatings",
    "client_record_id": "patent_ARC-14661-3",
    "center": "ARC",
    "eRelations": [],
    "reference_number": "ARC-14661-3",
    "expiration_date": "",
    "abstract": "Method and system for functionalizing a collection ...",
    "title": "Selective functionalization of carbon nanotubes ..."
    "innovator": [
      {
        "lname": "Khare",
        "mname": "N.",
        "company": "SETI Institute"
        "order": "1"
        "fname": "Bishun"
      }, {
        "lname": "Meyyappan",
        "company": "NASA Ames Research Center",
        "order": "2",
        "fname": "Meyya"
      }
    ],
    "contact": {
      "office": "Technology Partnerships Division",
      "facility": "NASA Ames Research Center",
      "email": "Trupti.D.Sanghani@nasa.gov",
      "name": "Trupti D. Sanghani",
      "address": "Mail Stop 202A-3, Moffett Field, CA 94035"
    },
    "publication": null
    "concepts": {
      "1": "Fundamental physics concepts",
      "0": "Metric space",
      "3": "Carbon",
      "2": "Temperature",
      "5": "Carbon nanotube",
      "4": "Gas",
      "7": "Topology",
      "6": "Ionizing radiation"
    },
    "serial_number": "11/387,503",
    "_id": "53f65b3d5904da2c9fc3008f",
    "patent_number": "7767270",
    "id": "patent_ARC-14661-3",
    "trl": "3 - Proof-of-concept"
  },
  ...
]}

CREATE DATABASE nasa_patents;
\connect nasa_patents;

CREATE TABLE patents (
	id TEXT PRIMARY KEY,
	data JSONB,
	created TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT now(),
	updated TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT now()
);

In order to have the updated field work properly, we'll make a stored procedure for it and hook it up with a trigger.

CREATE OR REPLACE FUNCTION update_modified_column()
RETURNS TRIGGER AS $$
BEGIN
    NEW.updated = now();
    RETURN NEW;
END;
$$ language 'plpgsql';

CREATE TRIGGER trigger_patents_updated BEFORE UPDATE ON patents
  FOR EACH ROW EXECUTE PROCEDURE update_modified_column();

This will set the updated column before the update is complete.

Now let's load the data up. Of course, you don't have to use Node.js like I am here. In your language of choice, insert into the patents TABLE for each entry of data.

'use strict';

const data = require('./data.json');
const pg = require('pg-promise')({});

const db = pg('postgres://localhost/nasa_patents');
db.tx((tx) => {
  const queries = [];
  for (let row of data.results) {
    queries.push(tx.query(`INSERT INTO patents (id, data)
      VALUES ($1, $2)
      ON CONFLICT (id) DO UPDATE SET data=$2`, [row._id, row]))
  }
  return tx.batch(queries);
}).catch((err) => {
  console.log(err);
  console.log(err.stack);
  pg.end();
}).then(() => {
  console.log('done');
  pg.end();
});

We've normalized id from entry._id right off the bat. If there's any other data that you'd like normalized in the table, feel free to alter your table and add it to your INSERT. If you wanted to auto-generate ids instead, use the SERIAL type, or a UUID type with an extension.

Notice the ON CONFLICT clause; this makes our INSERT more like a PUT in a document-store, and keeps our data in sync. We can run this as many times as we want and we'll only create as many rows as our data.results has entries.

If we wanted a MERGE behavior instead, meaning that we'd DELETE database entries that aren't in data.json, we'd have to take a more complicated approach.

'use strict';

const data = require('./data.json');
const pg = require('pg-promise')({});

const db = pg('postgres://localhost/nasa_patents');
db.tx((tx) => {
  const queries = [
    tx.none('DROP TABLE IF EXISTS _patents_incoming'),
    tx.none(`CREATE TABLE _patents_incoming (
      id TEXT PRIMARY KEY NOT NULL, data JSONB
    )`)
  ];
  for (let entry of data.results) {
    queries.push(tx.none(`INSERT INTO _patents_incoming (id, data)
      VALUES ($1, $2)`,
      [entry._id, entry]));
  }
  queries.push(tx.tx((tx2) => {
    const queries2 = [
      tx.none(`DELETE FROM patents USING patents AS p1
        LEFT OUTER JOIN _patents_incoming AS p2 ON p1.id=p2.id
        WHERE patents.id=p1.id AND p2.id IS NULL`),
      tx.none(`INSERT INTO patents SELECT * FROM _patents_incoming
        ON CONFLICT (id) DO UPDATE SET data=(
        SELECT data FROM _patents_incoming WHERE id=patents.id)`)
    ];
    return tx2.batch(queries2);
  }));
  return tx.batch(queries);
}).catch((err) => {
  console.log(err);
  console.log(err.stack);
  pg.end();
}).then(() => {
  console.log('done');
  pg.end();
});

In this case, we INSERT the data into a temporary table. We can then LEFT OUTER JOIN from the patents table to our temporary table, deleting any rows where we don't have matching data. From there, we INSERT using a SELECT of our temporary table, replacing the data when the ids conflict.

We could make a stored procedure to do this more cleanly, but we'd have to guarantee our JSON parameter we were passing it was less than 1GB. In fact, in the case of big files, we'd want to load this file asynchronously, parsing the JSON as we go and inserting it, but that's a bit out of scope for this blog post. If you have a better way of doing a MERGE with DELETE with a large JSON array, I'd love to see it.

Simple Get and Put Procedures

These are pretty simple, and maybe not worth doing, but it's a key/document-store right, so we must have Get and Put functions!

CREATE OR REPLACE FUNCTION GetPatent(in_id TEXT)
RETURNS SETOF patents AS $$ BEGIN
	RETURN QUERY SELECT * from patents WHERE id=in_id LIMIT 1;
END;
$$ language 'plpgsql';

select data->>'category' category
  FROM getpatent('53f656f85904da2c9fc2fe6b');
--aeronautics

CREATE OR REPLACE FUNCTION PutPatent(in_data JSON)
RETURNS TEXT AS $$
DECLARE
	r_id TEXT;
BEGIN
	INSERT INTO patents (id, data) VALUES (in_data::JSONB->>'_id',
      in_data::JSONB)
    ON CONFLICT (id) DO UPDATE SET data=in_data::JSONB RETURNING id
      INTO r_id;
	RETURN r_id;
END;
$$ language 'plpgsql';

SELECT PutPatent('{"_id": "ham", "test": "value"}'); -- ham

A fun note here, if you run the MERGE Node.js after this, it should get rid of your "ham" entry because it DELETEs any entries in new data.

The Hybrid Part

But so far, this is just a really basic document-store; I thought you said hybrid?

Oh, right, sorry about that. You may noticed that we're normalizing data->>'_id' out to id when INSERTing. We could make that a bit more transparent with a trigger, so let's add one for category.

ALTER TABLE patents ADD COLUMN category TEXT;

CREATE OR REPLACE FUNCTION normalize_patents_category()
RETURNS TRIGGER AS $$
BEGIN
	NEW.category := NEW.data->>'category';
	RETURN NEW;
END;
$$ language 'plpgsql';

CREATE TRIGGER trigger_patents_normalize_category
  BEFORE INSERT OR UPDATE ON patents FOR EACH ROW
  EXECUTE PROCEDURE normalize_patents_category();

Now you can re-run your merge or import scripts, and you'll have your category data expanded out. But this isn't very useful, is it? I mean, that data wasn't hard to deal with anyway. Let's do something more interesting.

Something More Interesting

In each patent is a list of innovators. Let's break them out into their own table, and keep it in sync. It's likely that some innovators may have more than one patent, so let's make a trigger that keeps a list of unique patent innovators.

CREATE TABLE innovators (
	fname TEXT,
	lname TEXT,
	mname TEXT,
	company TEXT,
	CONSTRAINT innovators_pkey
      PRIMARY KEY (fname, lname, mname, company)
);

CREATE OR REPLACE FUNCTION patents_to_innovators()
RETURNS TRIGGER AS $$
DECLARE
	nr innovators%rowtype;
BEGIN
	FOR nr IN
		SELECT (jsonb_populate_recordset(null::innovators,
          NEW.data->'innovator')).*
	LOOP
		nr.fname := COALESCE(nr.fname, '');
		nr.lname := COALESCE(nr.lname, '');
		nr.mname := COALESCE(nr.mname, '');
		nr.company := COALESCE(nr.company, '');
		INSERT INTO innovators (fname, lname, mname, company)
        (SELECT nr.*)
			ON CONFLICT ON CONSTRAINT innovators_pkey DO NOTHING;
	END LOOP;
    RETURN NEW;
END;
$$ LANGUAGE plpgsql;

CREATE TRIGGER trigger_patents_to_innovators
  AFTER INSERT OR UPDATE ON patents
  FOR EACH ROW EXECUTE PROCEDURE patents_to_innovators();

Now if you run your merge or import script, it'll populate the list of unique names. You may notice that all of the NEW.data->'innovator' fields are COALESCEd with empty strings to prevent NULL values. You might think NULLs wouldn't cause a problem (I thought that too), but in Postgresql NULL = NULL is never true, so the CONFLICT won't happen on or against rows with NULL and so we'd get duplicate entries. I COALESCE in the the INSERT's SELECT, but for the purpose of a blog post, showing it separately is more readable.

So now we have nice, normalized data for innovators. We could maybe make a SERIAL id for our primary key, and make a join table that references patents if we wanted to.

Another Use Case

The scenario that inspired this post was a 3rd party JSON feed of objects, each with child ids that refer to root level objects. In essence I had a flat array of objects, like our first example, but another view of the data was that of a tree.

Making a dependency flattened dependence tree for a specific object was straight forward with a CTE:

WITH RECURSIVE dtree(id, data) AS (
  SELECT id, data FROM objects WHERE id='some_id'
    -- specify the object to start with
UNION ALL
  SELECT modules.id, modules.data FROM modules
  JOIN dtree ON object.id=dtree.id
)
SELECT * FROM dtree;

But going the other direction, finding all of the objects that have a direct or indirect requirement of a specific object id in their dependency tree was harder. My first attempt at this involved a query that expanded out each object's dependency tree as above, and looked for instances of the object in question in that tree. This proved to be way too slow; I needed to normalize the data.

You'll notice it looks similar to how we normalized innovators.

CREATE TABLE dependencies (
  id INTEGER SERIAL,
  object_id TEXT,
  sub_dependency_id TEXT,
  CONSTRAINT dependencies_dupes
    UNIQUE (object_id, sub_dependency_id)
);

CREATE OR REPLACE FUNCTION objects_to_dependencies()
RETURNS TRIGGER AS $$
DECLARE
	nr dependencies%ROWTYPE;
BEGIN
	FOR object_cte IN
		SELECT jsonb_array_elements_text(NEW.data->'dependencies')
      AS sub_dependency_id,
      NEW.data->>'id' AS object_id
	LOOP
		INSERT INTO depenedencies (object_id, sub_dependency_id)
          (SELECT object_cte.*)
          ON CONFLICT ON CONSTRAINT dependencies_dupes DO NOTHING;
	END LOOP;
  RETURN NEW;
END;
$$ LANGUAGE plpgsql;

CREATE TRIGGER trigger_patents_to_innovators
  AFTER INSERT OR UPDATE ON patents
  FOR EACH ROW EXECUTE PROCEDURE patents_to_innovators();

Now that all of the data is normalized (at least the next time I sync, which I can do manually with an empty update), I can go reverse through the tree to find which objects use another object somewhere in their dependency tree.

WITH RECURSIVE  dtree(object_id, data, sub_dependency_id) AS (
  SELECT dependencies.object_id, objects.data,
    dependencies.sub_depedency_id
    FROM dependencies
    JOIN objects ON object.id=dependencies.object_id
    WHERE dependencies.sub_dependency_id='some_id'
UNION ALL
  SELECT objects.object_id, objects.data,
    dependencies.sub_dependency_id as dependency
    FROM dependencies JOIN dtree parent
      ON parent.object_id=dependencies.sub_dependency_id
    JOIN object ON object.object_id=dependencies.object_id
) SELECT object_id, data FROM dependencies;

This runs quite a bit faster than having to expand out the dependency tree of every object and check for the existence of the object I'm searching for as a dependency.

Bonus Example

Since we're using Postgresql as a document-store and relational database hybrid, we can do cool things, like joining two documents to each other. This was rather difficult in Postgresql 9.4, as there wasn't a way to modify a JSON(B) value in place. Now we can add fields and remove fields with the operators added in 9.5.

CREATE TABLE authors (id INTEGER PRIMARY KEY, data JSONB);
CREATE TABLE books (id INTEGER PRIMARY KEY, data JSONB, author_id
  INTEGER REFERENCES authors (id));

INSERT INTO authors (id, data) VALUES
(1, '{"first_name": "Mike", "last_name": "Speegle",
  "dob": "1979-02-03"}'::JSONB),
(2, '{"first_name": "Patrick", "last_name": "Rothfuss",
  "dob": "1973-06-06"}'::JSONB);

INSERT INTO books (id, author_id, data) VALUES
(1, 2, '{"title": "The Name of the Wind",
  "ISBN": "978-0-7564-0407-9"}'::JSONB),
(2, 2, '{"title": "Wise Man''s Fear",
  "ISBN": "978-0-7564-0473-4"}'::JSONB),
(3, 1, '{"title": "Something Greater Than Artifice",
  "ISBN": "978-0-692-26050-0"}'::JSONB),
(4, 1, '{"title": "Pen and Platen: Short Stories Written the Long Way",
  "ISBN": "978-0615574165"}'::JSONB),
(5, 2, '{"title": "A Slow Regard of Silent Things",
  "ISBN": "978-0-7564-1043-8"}'::JSONB);


WITH author_books_cte AS (
 	SELECT authors.data AS author_data, json_agg(books.data) as books_data
 		FROM authors
 		LEFT JOIN books ON books.author_id=authors.id GROUP BY authors.id
 )
 SELECT jsonb_pretty(author_data || jsonb_build_object('books', books_data))
  AS authors
 	FROM author_books_cte;

                                  authors
---------------------------------------------------------------------
 {                                                                  
     "dob": "1979-02-03",                                           
     "books": [                                                     
         {                                                          
             "ISBN": "978-0-692-26050-0",                           
             "title": "Something Greater Than Artifice"             
         },                                                         
         {                                                          
             "ISBN": "978-0615574165",                              
             "title": "Pen and Platen: Short Stories Written the
                Long Way"
         }                                                          
     ],                                                             
     "last_name": "Speegle",                                        
     "first_name": "Mike"                                           
 }
 {                                                                  
     "dob": "1973-06-06",                                           
     "books": [                                                     
         {                                                          
             "ISBN": "978-0-7564-0407-9",                           
             "title": "The Name of the Wind"                        
         },                                                     
         {                                                      
             "ISBN": "978-0-7564-0473-4",                       
             "title": "Wise Man's Fear"                         
         },                                                 
         {                                                      
             "ISBN": "978-0-7564-1043-8",                       
             "title": "A Slow Regard of Silent Things"          
         }                                                      
     ],                                                         
     "last_name": "Rothfuss",                               
     "first_name": "Patrick"                                
 }

By joining and aggregating the data with a JOIN in a CTE and then building the joined object with a CONCAT operator (new in 9.5), I can place the book data inside the authors data.

If you have any thoughts, questions, or suggestions on improving these queries, please Tweet me @fritzy. If you'd like some help working with your data in this way, please contact us. Maybe it's time for you to migrate away from MongoDB?

&yet

The Blog