Recology

R/etc.

PUT dataframes on your couch

    R couchdb sofa
 Source: .Rmd/.md

It would be nice to easily push each row or column of a data.frame into CouchDB, instead of having to convert them to JSON yourself and then push them to CouchDB. I recently added the ability to push data.frames into CouchDB using the normal PUT /{db} method, and added support for the CouchDB bulk API.

Install

install.packages("devtools")
devtools::install_github("sckott/sofa")
library("sofa")

PUT /db

You can write directly from a data.frame, either by rows or columns. First, rows:

#> $ok
#> [1] TRUE

Create a database

db_create(dbname="mtcarsdb")
#> $ok
#> [1] TRUE
out <- doc_create(mtcars, dbname="mtcarsdb", how="rows")
out[1:2]
#> $`Mazda RX4`
#> $`Mazda RX4`$ok
#> [1] TRUE
#> 
#> $`Mazda RX4`$id
#> [1] "0063109bfb1c15765854cbc9525c3a7a"
#> 
#> $`Mazda RX4`$rev
#> [1] "1-3946941c894a874697554e3e6d9bc176"
#> 
#> 
#> $`Mazda RX4 Wag`
#> $`Mazda RX4 Wag`$ok
#> [1] TRUE
#> 
#> $`Mazda RX4 Wag`$id
#> [1] "0063109bfb1c15765854cbc9525c461d"
#> 
#> $`Mazda RX4 Wag`$rev
#> [1] "1-273ff17a938cb956cba21051ab428b95"

Then by columns

out <- doc_create(mtcars, dbname="mtcarsdb", how="columns")
out[1:2]
#> $mpg
#> $mpg$ok
#> [1] TRUE
#> 
#> $mpg$id
#> [1] "0063109bfb1c15765854cbc9525d4f1f"
#> 
#> $mpg$rev
#> [1] "1-4b83d0ef53a28849a872d47ad03fef9a"
#> 
#> 
#> $cyl
#> $cyl$ok
#> [1] TRUE
#> 
#> $cyl$id
#> [1] "0063109bfb1c15765854cbc9525d57d3"
#> 
#> $cyl$rev
#> [1] "1-c21bfa5425c67743f0826fd4b44b0dbf"

Bulk API

The bulk API should be faster for larger data.frames.

#> $ok
#> [1] TRUE

We’ll use part of the diamonds dataset from the ggplot2 package

library("ggplot2")
dat <- diamonds[1:20000,]

Create a database

db_create(dbname="bulktest")
#> $ok
#> [1] TRUE

Load by row (you could instead load each column; see the how parameter), printing the time it takes

system.time(out <- bulk_create(dat, dbname="bulktest"))
#>    user  system elapsed 
#>  16.832   6.039  24.432

The returned data is the same as with doc_create()

out[1:2]
#> [[1]]
#> [[1]]$ok
#> [1] TRUE
#> 
#> [[1]]$id
#> [1] "0063109bfb1c15765854cbc9525d8b8d"
#> 
#> [[1]]$rev
#> [1] "1-f407fe4935af7fd17c101f13d3c81679"
#> 
#> 
#> [[2]]
#> [[2]]$ok
#> [1] TRUE
#> 
#> [[2]]$id
#> [1] "0063109bfb1c15765854cbc9525d998b"
#> 
#> [[2]]$rev
#> [1] "1-cf8b9a9dcdc026052a663d6fef8a36fe"

So that’s 20,000 rows in about 24 seconds, which is not bad.

not dataframes

You can also pass in lists, or vectors of JSON as character strings, e.g.,

lists

#> $ok
#> [1] TRUE
row.names(mtcars) <- NULL # get rid of row.names
lst <- parse_df(mtcars, tojson=FALSE)
db_create(dbname="bulkfromlist")
#> $ok
#> [1] TRUE
out <- bulk_create(lst, dbname="bulkfromlist")
out[1:2]
#> [[1]]
#> [[1]]$ok
#> [1] TRUE
#> 
#> [[1]]$id
#> [1] "ba70c46d73707662b1e204a90fcd9bb8"
#> 
#> [[1]]$rev
#> [1] "1-3946941c894a874697554e3e6d9bc176"
#> 
#> 
#> [[2]]
#> [[2]]$ok
#> [1] TRUE
#> 
#> [[2]]$id
#> [1] "ba70c46d73707662b1e204a90fcda9f6"
#> 
#> [[2]]$rev
#> [1] "1-273ff17a938cb956cba21051ab428b95"

json

#> $ok
#> [1] TRUE
strs <- as.character(parse_df(mtcars, "columns"))
db_create(dbname="bulkfromchr")
#> $ok
#> [1] TRUE
out <- bulk_create(strs, dbname="bulkfromchr")
out[1:2]
#> [[1]]
#> [[1]]$ok
#> [1] TRUE
#> 
#> [[1]]$id
#> [1] "ba70c46d73707662b1e204a90fce8c20"
#> 
#> [[1]]$rev
#> [1] "1-4b83d0ef53a28849a872d47ad03fef9a"
#> 
#> 
#> [[2]]
#> [[2]]$ok
#> [1] TRUE
#> 
#> [[2]]$id
#> [1] "ba70c46d73707662b1e204a90fce9bc1"
#> 
#> [[2]]$rev
#> [1] "1-c21bfa5425c67743f0826fd4b44b0dbf"
comments powered by Disqus