Recology

nonoyes - text analysis of Reply All podcast transcripts

Reply All is a great podcast. I’ve been wanting to learn some text analysis tools, and transcripts from the podcast are on their site.

Took some of the approaches outlined in this tidytext package vignette, and used the tokenizers package and some of the tidyverse.

Code is on GitHub at sckott/nonoyes

Also check out the HTML version

Setup

Load deps

library("httr")
library("xml2")
library("stringi")
library("dplyr")
library("ggplot2")
library("tokenizers")
library("tidytext")
library("tidyr")

Source helper functions

source("funs.R")

Set the base URL

ra_base <- "https://gimletmedia.com/show/reply-all/episodes"

URLs

Make URLs for each page of episodes

urls <- c(ra_base, file.path(ra_base, "page", 2:8))

Get URLs for each episode

res <- lapply(urls, get_urls)

Remove those that are rebroadcasts, updates, or revisited

res <- grep("rebroadcast|update|revisited", unlist(res), value = TRUE, invert = TRUE)

Episode names

Add episode numbers to the episodes that don't have them

epnames <- sub("/$", "", sub("https://gimletmedia.com/episode/", "", res))
epnames <- sub("the-anxiety-box", "8-the-anxiety-box", epnames)
epnames <- sub("french-connection", "10-french-connection", epnames)
epnames <- sub("ive-killed-people-and-i-have-hostages", "15-ive-killed-people-and-i-have-hostages", epnames)
epnames <- sub("6-this-proves-everything", "75-this-proves-everything", epnames)
epnames <- sub("zardulu", "56-zardulu", epnames)

Transcripts

Get transcripts

txts <- lapply(res, transcript_fetch, sleep = 1)

Parse transcripts

txtsp <- lapply(txts, transcript_parse)

Summary word usage

Summarise data for each transcript

dat <- stats::setNames(lapply(txtsp, function(m) {
  bind_rows(lapply(m, function(v) {
    tmp <- unname(vapply(v, nchar, 1))
    data_frame(
      n = length(tmp),
      mean = mean(tmp),
      n_laugh = count_word(v, "laugh"),
      n_groan = count_word(v, "groan")
    )
  }), .id = "name")
}), epnames)

Bind data together into a single data frame, then filter and summarise

data <- bind_rows(dat, .id = "episode") %>%
  filter(!is.na(episode)) %>%
  filter(grepl("^PJ$|^ALEX GOLDMAN$", name)) %>%
  mutate(ep_no = as.numeric(strextract(episode, "^[0-9]+"))) %>%
  group_by(ep_no) %>%
  mutate(nrow = NROW(ep_no)) %>%
  ungroup() %>%
  filter(nrow == 2)
data
#> # A tibble: 114 × 8
#>                 episode         name     n      mean n_laugh n_groan ep_no
#>                   <chr>        <chr> <int>     <dbl>   <int>   <int> <dbl>
#> 1            73-sandbox           PJ    89 130.65169       9       0    73
#> 2            73-sandbox ALEX GOLDMAN    25  44.00000       1       1    73
#> 3       72-dead-is-paul           PJ   137  67.77372      17       0    72
#> 4       72-dead-is-paul ALEX GOLDMAN    90  61.82222       8       0    72
#> 5  71-the-picture-taker           PJ    74  77.70270       3       0    71
#> 6  71-the-picture-taker ALEX GOLDMAN    93 105.94624       6       0    71
#> 7        69-disappeared           PJ    72  76.50000       2       0    69
#> 8        69-disappeared ALEX GOLDMAN    50 135.90000       5       0    69
#> 9      68-vampire-rules           PJ   142  88.00704       6       0    68
#> 10     68-vampire-rules ALEX GOLDMAN   117  73.16239      13       0    68
#> # ... with 104 more rows, and 1 more variables: nrow <int>

Number of words - seems PJ talks more, but I didn't do a quantitative comparison

ggplot(data, aes(ep_no, n, colour = name)) +
  geom_point(size = 3, alpha = 0.5) +
  geom_line(aes(group = ep_no), colour = "black") +
  scale_color_discrete(labels = c('Alex', 'PJ'))
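
A quick way to put a number on that would be to summarise word counts per speaker, for example:

# words per episode, summarised by speaker
data %>%
  group_by(name) %>%
  summarise(median_n = median(n), mean_n = mean(n))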

Laughs per episode - take home: PJ laughs a lot

ggplot(data, aes(ep_no, n_laugh, colour = name)) +
  geom_point(size = 3, alpha = 0.5) +
  geom_line(aes(group = ep_no), colour = "black") +
  scale_color_discrete(labels = c('Alex', 'PJ'))

Sentiment

Drop episodes whose transcripts didn't parse (zero length)

zero <- which(vapply(txtsp, length, 1) == 0)
txtsp_ <- Filter(function(x) length(x) != 0, txtsp)

Tokenize words and create a data_frame

wordz <- stats::setNames(
  lapply(txtsp_, function(z) {
    bind_rows(
      if (is.null(try_tokenize(z$`ALEX GOLDMAN`))) {
        data_frame()
      } else {
        data_frame(
          name = "Alex",
          word = try_tokenize(z$`ALEX GOLDMAN`)
        )
      },
      if (is.null(try_tokenize(z$PJ))) {
        data_frame()
      } else {
        data_frame(
          name = "PJ",
          word = try_tokenize(z$PJ)
        )
      }
    )
  }), epnames[-zero])

Combine into a single data_frame

(wordz_df <- bind_rows(wordz, .id = "episode"))
#> # A tibble: 104,713 × 3
#>       episode  name      word
#>         <chr> <chr>     <chr>
#> 1  73-sandbox  Alex      alex
#> 2  73-sandbox  Alex   goldman
#> 3  73-sandbox  Alex         i
#> 4  73-sandbox  Alex generally
#> 5  73-sandbox  Alex     don’t
#> 6  73-sandbox  Alex      alex
#> 7  73-sandbox  Alex    really
#> 8  73-sandbox  Alex      alex
#> 9  73-sandbox  Alex    groans
#> 10 73-sandbox  Alex        so
#> # ... with 104,703 more rows

Calculate sentiment using tidytext

bing <- sentiments %>%
  filter(lexicon == "bing") %>%
  select(-score)
sent <- wordz_df %>%
  inner_join(bing) %>%
  count(name, episode, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative) %>%
  ungroup() %>%
  filter(!is.na(episode)) %>%
  complete(episode, name) %>%
  mutate(ep_no = as.numeric(strextract(episode, "^[0-9]+")))
sent
#> # A tibble: 148 × 6
#>                                        episode  name negative positive
#>                                          <chr> <chr>    <dbl>    <dbl>
#> 1  1-an-app-sends-a-stranger-to-say-i-love-you  Alex       19       30
#> 2  1-an-app-sends-a-stranger-to-say-i-love-you    PJ       14       14
#> 3                         10-french-connection  Alex       15       32
#> 4                         10-french-connection    PJ       16       36
#> 5     11-did-errol-morris-brother-invent-email  Alex       NA       NA
#> 6     11-did-errol-morris-brother-invent-email    PJ       25       30
#> 7                           12-backend-trouble  Alex       20       15
#> 8                           12-backend-trouble    PJ       40       59
#> 9                              13-love-is-lies  Alex       NA       NA
#> 10                             13-love-is-lies    PJ       45       64
#> # ... with 138 more rows, and 2 more variables: sentiment <dbl>,
#> #   ep_no <dbl>

Each name plotted separately

ggplot(sent, aes(ep_no, sentiment, fill = name)) +
  geom_bar(stat = "identity") +
  facet_wrap(~name, ncol = 2, scales = "free_x")

Compare for each episode

ggplot(sent, aes(ep_no, sentiment, fill = name)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.5), width = 0.6)

Most common positive and negative words

Clearly, the word “like” is rarely used here as a positive word (i.e., meaning that they like something); it's mostly the colloquial “like, totally” usage, so it's removed.
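
sent_cont_plot() comes from funs.R; as a rough sketch (the sourced version may differ), such a helper could count the bing-scored words for one speaker, drop “like”, and plot the top contributors to each sentiment:

# hypothetical sketch of sent_cont_plot() - the real funs.R may differ
sent_cont_plot <- function(df, who, n_words = 10) {
  df %>%
    filter(name == who, word != "like") %>%
    inner_join(bing, by = "word") %>%
    count(word, sentiment, sort = TRUE) %>%
    group_by(sentiment) %>%
    top_n(n_words, n) %>%
    ungroup() %>%
    ggplot(aes(reorder(word, n), n, fill = sentiment)) +
    geom_bar(stat = "identity") +
    facet_wrap(~sentiment, scales = "free_y") +
    coord_flip() +
    labs(x = NULL, y = "contribution to sentiment")
}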

Alex

sent_cont_plot(wordz_df, "Alex")

PJ

sent_cont_plot(wordz_df, "PJ")

video editing notes

This is how I edit videos of talks when I need to combine slides and the recorded video.

I’m on a Mac

  • import to iMovie (using v10 something)
  • drop movie into editing section
  • split the PDF slides into individual files: pdfseparate foobar.pdf %d.pdf
  • convert each individual PDF slide into a PNG: sips -s format png --out "${pdf%%.*}.png" "$pdf" (see the sketch after this list)
  • import the PNGs into iMovie
  • for each image, drop into editing area where you want it
  • when focused on the png of the slide:
    • select crop, then choose fit, say okay
    • select “add as overlay” (left-most symbol), then choose picture-in-picture
    • then choose swap
    • then move inset to where you want it
    • say okay
  • rinse and repeat for all slides
  • export - via File option
  • share to youtube
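
The pdfseparate and sips commands above imply a loop over the separated slide PDFs. Since I'm in R anyway, a minimal sketch of that split-and-convert step (assuming poppler's pdfseparate and macOS's sips are on the PATH, and a hypothetical foobar.pdf) could be:

# split the deck into one PDF per slide, then convert each page to a PNG
system2("pdfseparate", c("foobar.pdf", "%d.pdf"))
for (pdf in list.files(pattern = "^[0-9]+\\.pdf$")) {
  system2("sips", c("-s", "format", "png", "--out", sub("\\.pdf$", ".png", pdf), pdf))
}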

An example of the result

Marine Regions data in R

UPDATE: the package API has changed - I've updated the post below to work with the current CRAN version, submitted 2016-08-02

I was at a hackathon focused on Ocean Biogeographic Information System (OBIS) data back in November last year in Belgium. One project idea was to make it easier to get at data based on one or more marine regions. I was told that Marineregions.org is a common source of shapefiles for different marine regions that people then use in other work.

During the event I started a package mregions. Here I’ll show how to get different marine regions, then use those outputs to get species occurrence data.

We’ll use WKT (Well-Known Text) to define spatial dimensions in this post. If you don’t know what it is, Wikipedia to the rescue: https://en.wikipedia.org/wiki/Well-known_text
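
For example, a simple polygon in WKT looks like:

POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))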

Install

install.packages("mregions")
devtools::install_github("iobis/robis")

Or get the dev version

devtools::install_github("ropenscilabs/mregions")
library("mregions")

Get list of place types

res <- mr_place_types()
head(res$type)
#> [1] "Town"                      "Arrondissement"           
#> [3] "Department"                "Province (administrative)"
#> [5] "Country"                   "Continent"

Get Marineregions records by place type

res <- mr_records_by_type(type = "EEZ")
head(res)
#>   MRGID                                            gazetteerSource
#> 1  3293 Maritime Boundaries Geodatabase, Flanders Marine Institute
#> 2  5668 Maritime Boundaries Geodatabase, Flanders Marine Institute
#> 3  5669 Maritime Boundaries Geodatabase, Flanders Marine Institute
#> 4  5670 Maritime Boundaries Geodatabase, Flanders Marine Institute
#> 5  5672 Maritime Boundaries Geodatabase, Flanders Marine Institute
#> 6  5673 Maritime Boundaries Geodatabase, Flanders Marine Institute
#>   placeType latitude longitude minLatitude minLongitude maxLatitude
#> 1       EEZ 51.46483  2.704458    51.09111     2.238118    51.87000
#> 2       EEZ 53.61508  4.190675    51.26203     2.539443    55.76500
#> 3       EEZ 54.55970  8.389231    53.24281     3.349999    55.91928
#> 4       EEZ 40.87030 19.147094    39.63863    18.461940    41.86124
#> 5       EEZ 42.94272 29.219062    41.97820    27.449580    43.74779
#> 6       EEZ 43.42847 15.650844    41.62201    13.001390    45.59079
#>   maxLongitude precision            preferredGazetteerName
#> 1     3.364907  58302.49   Belgian Exclusive Economic Zone
#> 2     7.208364 294046.10     Dutch Exclusive Economic Zone
#> 3    14.750000 395845.50    German Exclusive Economic Zone
#> 4    20.010030 139751.70  Albanian Exclusive Economic Zone
#> 5    31.345280 186792.50 Bulgarian Exclusive Economic Zone
#> 6    18.552360 313990.30  Croatian Exclusive Economic Zone
#>   preferredGazetteerNameLang   status accepted
#> 1                    English standard     3293
#> 2                    English standard     5668
#> 3                    English standard     5669
#> 4                    English standard     5670
#> 5                    English standard     5672
#> 6                    English standard     5673

Get and search region names

(res <- mr_names())
#> # A tibble: 676 x 4
#>                              name
#>                             <chr>
#> 1           Morocco:elevation_10m
#> 2          Emodnet:emodnet1x1grid
#> 3                    Emodnet:grid
#> 4                     Morocco:dam
#> 5             WoRMS:azmp_sections
#> 6    Morocco:accomodationcapacity
#> 7          Morocco:admin_boundary
#> 8  Lifewatch:ovam_afvalverwerking
#> 9          Eurobis:eurobis_points
#> 10                  Morocco:roads
#> # ... with 666 more rows, and 3 more variables: title <chr>,
#> #   name_first <chr>, name_second <chr>
mr_names_search(res, q = "IHO")
#> # A tibble: 5 x 4
#>                                   name
#>                                  <chr>
#> 1                    MarineRegions:iho
#> 2 MarineRegions:iho_quadrants_20150810
#> 3                     World:iosregions
#> 4       MarineRegions:eez_iho_union_v2
#> 5                   Belgium:vl_venivon
#> # ... with 3 more variables: title <chr>, name_first <chr>,
#> #   name_second <chr>

Get a region - geojson

res <- mr_geojson(name = "Turkmen Exclusive Economic Zone")
class(res)
#> [1] "mr_geojson"
names(res)
#> [1] "type"          "totalFeatures" "features"      "crs"          
#> [5] "bbox"

Get a region - shp

res <- mr_shp(name = "Belgian Exclusive Economic Zone")
class(res)
#> [1] "SpatialPolygonsDataFrame"
#> attr(,"package")
#> [1] "sp"

Get OBIS EEZ ID

res <- mr_names()
res <- res[grepl("eez", res$name, ignore.case = TRUE),]
mr_obis_eez_id(res$title[2])
#> [1] 84

Convert to WKT

From geojson or shp. Here, geojson

res <- mr_geojson(key = "MarineRegions:eez_33176")
mr_as_wkt(res, fmt = 5)
#> [1] "MULTIPOLYGON (((41.573732 -1.659444, 45.891882 ... cutoff

Get regions, then OBIS data

Using Well-Known Text

Both shp and geojson data returned from mr_shp() and mr_geojson(), respectively, can be passed to mr_as_wkt() to get WKT.

shp <- mr_shp(name = "Belgian Exclusive Economic Zone")
wkt <- mr_as_wkt(shp)
library('httr')
library('data.table')
args <- list(scientificname = "Abra alba", geometry = wkt, limit = 100)
res <- httr::GET('http://api.iobis.org/occurrence', query = args)
xx <- data.table::setDF(data.table::rbindlist(httr::content(res)$results, use.names = TRUE, fill = TRUE))
xx <- xx[, c('scientificName', 'decimalLongitude', 'decimalLatitude')]
names(xx)[2:3] <- c('longitude', 'latitude')

Plot

library('leaflet')
leaflet() %>%
  addTiles() %>%
  addCircleMarkers(data = xx) %>%
  addPolygons(data = shp)

map1

Using EEZ ID

EEZ stands for Exclusive Economic Zone

(eez <- mr_obis_eez_id("Belgian Exclusive Economic Zone"))
#> [1] 59

You currently can’t search for OBIS occurrences by EEZ ID, but hopefully soon…

Dealing with bigger WKT

What if your WKT string is super long? That's often a problem: some online species occurrence databases that accept WKT for geometry searches choke because of URL length limits (around 8,000 characters, including the rest of the URL). One way to deal with it is to reduce detail - simplify.
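
A quick way to see whether a region will hit that limit is to check the length of its WKT string, e.g. with the Belgian EEZ WKT from above:

nchar(wkt)  # compare against the ~8000 character limit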

install.packages("rmapshaper")

Using rmapshaper we can simplify a spatial object, then search with that.

shp <- mr_shp(name = "Dutch Exclusive Economic Zone")

Visualize

leaflet() %>%
  addTiles() %>%
  addPolygons(data = shp)

map2

Simplify

library("rmapshaper")
shp <- ms_simplify(shp)

It’s simplified:

leaflet() %>%
  addTiles() %>%
  addPolygons(data = shp)

map3
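
ms_simplify() also has a keep argument (the proportion of points to retain) if you want to control how much detail is dropped, for instance:

# e.g. retain only ~1% of the original vertices (illustrative variable name)
shp_1pct <- ms_simplify(mr_shp(name = "Dutch Exclusive Economic Zone"), keep = 0.01)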

Pass to GBIF

if (!requireNamespace("rgbif")) {
  install.packages("rgbif")
}
library("rgbif")
occ_search(geometry = mr_as_wkt(shp), limit = 400)
#> Records found [2395936] 
#> Records returned [400] 
#> No. unique hierarchies [17] 
#> No. media records [3] 
#> Args [geometry=POLYGON ((7.2083632399999997 53.2428091399999985,
#>      6.6974995100000001 53.4619365499999972, 5.89083650, limit=400,
#>      offset=0, fields=all] 
#> # A tibble: 400 x 102
#>                     name        key decimalLatitude decimalLongitude
#>                    <chr>      <int>           <dbl>            <dbl>
#> 1  Haematopus ostralegus 1265900666        52.13467          4.21306
#> 2          Limosa limosa 1265577408        53.03249          4.88665
#> 3       Corvus splendens 1269536058        51.98297          4.12982
#> 4       Corvus splendens 1269536065        51.98297          4.12982
#> 5       Lanius excubitor 1211320606        52.57223          4.62569
#> 6       Lanius excubitor 1211320862        52.67419          4.63645
#> 7       Lanius excubitor 1211320806        53.05790          4.72974
#> 8       Lanius excubitor 1211321040        52.57540          4.63651
#> 9       Lanius excubitor 1211320590        52.41180          5.19500
#> 10      Lanius excubitor 1211320401        52.57535          4.63654
#> # ... with 390 more rows, and 98 more variables: issues <chr>,
#> #   datasetKey <chr>, publishingOrgKey <chr>, publishingCountry <chr>,
#> #   protocol <chr>, lastCrawled <chr>, lastParsed <chr>, extensions <chr>,
#> #   basisOfRecord <chr>, taxonKey <int>, kingdomKey <int>,
#> #   phylumKey <int>, classKey <int>, orderKey <int>, familyKey <int>,
#> #   genusKey <int>, speciesKey <int>, scientificName <chr>, kingdom <chr>,
#> #   phylum <chr>, order <chr>, family <chr>, genus <chr>, species <chr>,
#> #   genericName <chr>, specificEpithet <chr>, taxonRank <chr>,
#> #   dateIdentified <chr>, coordinateUncertaintyInMeters <dbl>, year <int>,
#> #   month <int>, day <int>, eventDate <chr>, modified <chr>,
#> #   lastInterpreted <chr>, references <chr>, identifiers <chr>,
#> #   facts <chr>, relations <chr>, geodeticDatum <chr>, class <chr>,
#> #   countryCode <chr>, country <chr>, rightsHolder <chr>,
#> #   identifier <chr>, informationWithheld <chr>, verbatimEventDate <chr>,
#> #   datasetName <chr>, gbifID <chr>, collectionCode <chr>,
#> #   verbatimLocality <chr>, occurrenceID <chr>, taxonID <chr>,
#> #   license <chr>, recordedBy <chr>, catalogNumber <chr>,
#> #   http...unknown.org.occurrenceDetails <chr>, institutionCode <chr>,
#> #   rights <chr>, eventTime <chr>, identificationID <chr>,
#> #   individualCount <int>, continent <chr>, stateProvince <chr>,
#> #   nomenclaturalCode <chr>, locality <chr>, language <chr>,
#> #   taxonomicStatus <chr>, type <chr>, preparations <chr>,
#> #   disposition <chr>, originalNameUsage <chr>,
#> #   bibliographicCitation <chr>, identifiedBy <chr>, sex <chr>,
#> #   lifeStage <chr>, otherCatalogNumbers <chr>, habitat <chr>,
#> #   georeferencedBy <chr>, vernacularName <chr>, elevation <dbl>,
#> #   minimumDistanceAboveSurfaceInMeters <chr>, dynamicProperties <chr>,
#> #   samplingEffort <chr>, organismName <chr>, georeferencedDate <chr>,
#> #   georeferenceProtocol <chr>, georeferenceVerificationStatus <chr>,
#> #   organismID <chr>, ownerInstitutionCode <chr>, samplingProtocol <chr>,
#> #   datasetID <chr>, accessRights <chr>, georeferenceSources <chr>,
#> #   elevationAccuracy <dbl>, depth <dbl>, depthAccuracy <dbl>,
#> #   waterBody <chr>