How to find a dataset with specific attributes?

The `datasets` package and various other packages come with many useful data sets; however, there seems to be no easy way to find the ideal data set when you need one for package examples, for teaching purposes, or to ask / answer a question here on SO.

Say, for example, I want a data set that is a `data.frame`, has at least 2 `character` columns, and has fewer than 100 rows.

How can I examine each available data set and see the maximum relevant information to make my choice?

My past attempts were messy and time-consuming, and they crashed on some packages whose objects have an unusual structure, for example `caret`.

+6
source share
4 answers

Expand / modify as you wish.

library(data.table)

# Index of every dataset shipped with every installed package. The first
# pattern strips a trailing " (...)" from `Item`; the second keeps only the text
# inside the parentheses, which is the name handed to data() for loading.
dt <- as.data.table(data(package = .packages(all.available = TRUE))$results)
dt <- dt[, `:=`(Item   = sub(' \\(.*', '', Item),
                Object = sub('.*\\((.*)\\)', '\\1', Item))]

# One summary row per (Package, Item).
dt[, {
  # Load into a throwaway environment so that objects in the user's workspace
  # are never overwritten and nothing is left behind afterwards.
  sandbox <- new.env()
  data(list = Object, package = Package, envir = sandbox)

  # Look the dataset up by name rather than via eval(parse()), which breaks on
  # non-syntactic names. A dataset that did not load yields NULL here.
  d <- sandbox[[Item]]

  # Column classes are only collected for data frames; anything else gets NA,
  # which makes the column counts below NA as well.
  classes <- if (inherits(d, "data.frame")) {
    unlist(lapply(d, class))
  } else {
    NA_integer_
  }

  .(class    = paste(class(d), collapse = ","),
    nrow     = if (!is.null(nrow(d))) nrow(d) else NA_integer_,
    ncol     = if (!is.null(ncol(d))) ncol(d) else NA_integer_,
    charCols = sum(classes == 'character'),
    facCols  = sum(classes == 'factor'))
}, by = .(Package, Item)]
#      Package          Item                                               class nrow ncol charCols facCols
#  1: datasets AirPassengers                                                  ts   NA   NA       NA      NA
#  2: datasets       BJsales                                                  ts   NA   NA       NA      NA
#  3: datasets  BJsales.lead                                                  ts   NA   NA       NA      NA
#  4: datasets           BOD                                          data.frame    6    2        0       0
#  5: datasets           CO2 nfnGroupedData,nfGroupedData,groupedData,data.frame   84    5        0       3
# ---                                                                                                      
#492: survival    transplant                                          data.frame  815    6        0       3
#493: survival        uspop2                                               array  101    2       NA      NA
#494: survival       veteran                                          data.frame  137    8        0       1
#495:  viridis   viridis.map                                          data.frame 1024    4        1       0
#496:   xtable           tli                                          data.frame  100    5        0       3
+2
source

I reworked @eddi's kickass answer like this:

  • This is a function.
  • It does not clutter up the workspace (it just loads data.table)
  • I renamed the columns to shorter ones and added a few more
  • Classes of list items are also checked.
  • I return the name of the object and dataset
  • Warnings raised by packages can be silenced

You just run this (if you installed data.table):

ds <- dataset_summary() # around 5 seconds if you have a lot of packages like me

It works with the `caret` package, which was problematic (see the edit history).

subset(ds,Package == "caret")
#     Package           Object             Item                                                   Title      class nrow ncol char fact ord num int list df
# 143   caret     GermanCredit     GermanCredit                                      German Credit Data data.frame 1000   62    0    1   0  54   7    0  0
# 144   caret       Sacramento       Sacramento                               Sacramento CA Home Prices data.frame  932    9    0    3   0   3   3    0  0
# 145   caret          tecator           absorp          Fat, Water and Protein Content of Meat Samples     matrix  215  100   NA   NA  NA  NA  NA   NA NA
# 146   caret       BloodBrain         bbbDescr                                Blood Brain Barrier Data data.frame  208  134    0    0   0 118  16    0  0
# 147   caret             cars             cars Kelly Blue Book resale data for 2005 model year GM cars data.frame  804   18    0    0   0   1  17    0  0
# 148   caret             cox2        cox2Class                                     COX-2 Activity Data     factor   NA   NA   NA   NA  NA  NA  NA   NA NA

It is easy to subset the result and search for a specific data set; the classes of `list` elements and of `data.frame` columns are counted.

subset(ds,class == 'list' & df > 0,select=-c(2,4))
#           Package           Item class nrow ncol char fact ord num int list df
# 225       ecodist       iris.fit  list   NA   NA    0    0   0   1   0    0  1
# 238 ElemStatLearn  orange10.test  list   NA   NA    0    0   0   0   0    0 50
# 239 ElemStatLearn orange10.train  list   NA   NA    0    0   0   0   0    0 50
# 240 ElemStatLearn   orange4.test  list   NA   NA    0    0   0   0   0    0 50
# 241 ElemStatLearn  orange4.train  list   NA   NA    0    0   0   0   0    0 50
# 346          lava    missingdata  list   NA   NA    0    0   0   0   0    0  4

Workspace is clean

ls()
# [1] "dataset_summary" "ds"

The only package the function attaches is `data.table`:

search()
# [1] ".GlobalEnv"         "package:data.table" "package:Matrix"     "package:sp"         "package:timeSeries" "package:timeDate"  
# [7] "tools:rstudio"      "package:stats"      "package:graphics"   "package:grDevices"  "package:utils"      "package:datasets"  
# [13] "package:methods"    "Autoloads"          "package:base"

#' Summarise every dataset shipped with the installed packages.
#'
#' For each dataset listed by `data(package = ...)` the dataset is loaded and
#' its class, dimensions and the classes of its columns (data frames) or
#' elements (lists) are counted.
#'
#' Datasets are loaded into a private, throwaway environment, so objects in
#' the caller's workspace are never overwritten and nothing is left behind.
#' The function attaches data.table via `library()`.
#'
#' @param silent If `TRUE` (the default) warnings are switched off for the
#'   duration of the call; the previous `warn` option is restored on exit.
#' @return A data.frame with one row per (Package, Item) and the columns
#'   `class`, `nrow`, `ncol`, `char`, `fact`, `ord`, `num`, `int`, `list` and
#'   `df`. The count columns are `NA` for objects that are neither a data
#'   frame nor a list.
dataset_summary <- function(silent = TRUE) {
  if (silent) {
    old_opts <- options(warn = -1)
    on.exit(options(old_opts), add = TRUE)
  }

  library(data.table)

  # The first pattern strips a trailing " (...)" from `Item`; the second keeps
  # only the text inside the parentheses, which is the name given to data().
  dt <- as.data.table(data(package = .packages(all.available = TRUE))$results)
  dt <- dt[, `:=`(Item   = sub(' \\(.*', '', Item),
                  Object = sub('.*\\((.*)\\)', '\\1', Item))]

  df <- as.data.frame(dt[, {
    # A fresh environment per dataset keeps the global workspace untouched,
    # so no snapshot/cleanup of .GlobalEnv is needed.
    sandbox <- new.env()
    data(list = Object, package = Package, envir = sandbox)

    # Direct lookup by name instead of eval(parse()), which fails on
    # non-syntactic names. A dataset that did not load yields NULL.
    d <- sandbox[[Item]]

    # Element classes are collected for data frames and lists only; for
    # everything else the NA propagates into all count columns.
    classes <- if (inherits(d, c("data.frame", "list"))) {
      unlist(lapply(d, class))
    } else {
      NA_integer_
    }

    .(class    = paste(class(d), collapse = ","),
      nrow     = if (!is.null(nrow(d))) nrow(d) else NA_integer_,
      ncol     = if (!is.null(ncol(d))) ncol(d) else NA_integer_,
      char     = sum(classes == 'character'),
      fact     = sum(classes == 'factor'),
      ord      = sum(classes == 'ordered'),
      num      = sum(classes == 'numeric'),
      int      = sum(classes == 'integer'),
      list     = sum(classes == 'list'),
      df       = sum(classes == 'data.frame'))
  }, by = .(Package, Item)])

  df
}
+3

This searches the `datasets` package for objects that are a `data.frame`, have at most 100 rows, and have at least two `character` (or `factor`) columns.

library(datasets)
res <- library(help = "datasets")

# Candidate names: everything before the first space of each index line.
# Lines that start with a space (or are empty) give "" and are dropped.
dat <- sub(" .*$", "", res$info[[2]])
dat <- dat[dat != ""]

# TRUE for names that refer to a data frame with at most 100 rows and at least
# two character columns or at least two factor columns.
is_candidate <- vapply(dat, function(nm) {
  # Look only inside the attached datasets package so that workspace objects
  # (or this function's own locals) can never shadow a dataset. Names that do
  # not correspond to an object simply give NULL.
  d <- tryCatch(
    get(nm, envir = as.environment("package:datasets"), inherits = FALSE),
    error = function(e) NULL
  )
  if (!inherits(d, "data.frame") || nrow(d) > 100) {
    return(FALSE)
  }
  # vapply keeps these sums well-defined even for a zero-column data frame.
  n_char <- sum(vapply(d, is.character, logical(1)))
  n_fact <- sum(vapply(d, is.factor, logical(1)))
  n_char >= 2 || n_fact >= 2
}, logical(1), USE.NAMES = FALSE)

df_names <- dat[is_candidate]

# Report each match as it was reported before, one name per line.
invisible(lapply(df_names, print))

df_names
[1] "CO2"        "esoph"      "npk"        "sleep"      "warpbreaks"

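Note that the matching columns are of class `factor`, because the data sets were created with `stringsAsFactors = TRUE`. The names of the matching data sets are stored in `df_names`; use `get` to retrieve a data set by its name.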

+1

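Here is a function, `myfun()`, that returns the class, the number of rows, the number of columns, and the class of each column.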

caret , - . caret . , caret .

, , myfun() .

#' Describe every object found in an attached package.
#'
#' @param package Name of a package (character). The package must already be
#'   attached, because the objects are listed from its "package:<name>" entry
#'   on the search path.
#' @return A list-matrix with one row per object (row names are the object
#'   names) and the columns `data_class`, `nrow`, `ncol` and `classes`.
#'   `nrow`, `ncol` and `classes` are `NA` unless the object is a data frame
#'   or a matrix. For a data frame `classes` lists the class of each column;
#'   for a matrix `lapply()` visits every cell, so one class per cell is listed.
myfun <- function(package) {
  fields <- c("data_class", "nrow", "ncol", "classes")

  env_name <- paste0("package:", package)
  if (!env_name %in% search()) {
    stop("package '", package, "' is not attached; call library() first",
         call. = FALSE)
  }
  object_names <- ls(env_name)

  describe <- function(x) {
    # Same lookup that `package::x` performs, without building and parsing
    # source text.
    y <- getExportedValue(package, x)
    tabular <- inherits(y, c("data.frame", "matrix"))
    list(
      data_class = paste0(class(y), collapse = ","),
      nrow       = if (tabular) nrow(y) else NA_integer_,
      ncol       = if (tabular) ncol(y) else NA_integer_,
      classes    = if (tabular) {
        paste0(unlist(lapply(y, class)), collapse = ",")
      } else {
        NA
      }
    )
  }

  rows <- lapply(object_names, describe)
  if (length(rows) == 0L) {
    # Nothing to describe: return an empty matrix with the usual columns.
    return(matrix(list(), nrow = 0L, ncol = length(fields),
                  dimnames = list(NULL, fields)))
  }

  out <- do.call(rbind, rows)
  rownames(out) <- object_names
  out
}

# Attach datasets so that "package:datasets" is on the search path, which is
# where myfun() lists the objects from.
library( datasets )
# Summarise every object found in the attached datasets package.
meta_data <- myfun( package = "datasets")
# Show the first rows of the summary.
head(meta_data)
#               data_class   nrow ncol classes                                                          
# ability.cov   "list"       NA   NA   NA                                                               
# airmiles      "ts"         NA   NA   NA                                                               
# AirPassengers "ts"         NA   NA   NA                                                               
# airquality    "data.frame" 153  6    "integer,integer,numeric,integer,integer,integer"                
# anscombe      "data.frame" 11   8    "numeric,numeric,numeric,numeric,numeric,numeric,numeric,numeric"
# attenu        "data.frame" 182  5    "numeric,numeric,factor,numeric,numeric"  

meta_data[ "ChickWeight", ]
# $data_class
# [1] "nfnGroupedData,nfGroupedData,groupedData,data.frame"
# 
# $nrow
# [1] 578
# 
# $ncol
# [1] 4
# 
# $classes
# [1] "numeric,numeric,ordered,factor,factor"

# Attach caret so that "package:caret" is on the search path for myfun().
library( 'caret' )
# Summarise every object found in the attached caret package.
meta_data <- myfun( package = "caret")
#               data_class nrow ncol classes
# anovaScores   "function" NA   NA   NA     
# avNNet        "function" NA   NA   NA     
# bag           "function" NA   NA   NA     
# bagControl    "function" NA   NA   NA     
# bagEarth      "function" NA   NA   NA     
# bagEarthStats "function" NA   NA   NA 

To leave the search path as it was before calling `myfun()`, detach the newly attached packages afterwards:

# Snapshot of the search path before caret (and whatever it attaches) is added.
loaded_pkgs <- search()

library("caret")
meta_data <- myfun(package = "caret")

# Every search-path entry that was not there before gets detached again,
# addressed by its name.
unload_pkgs <- setdiff(search(), loaded_pkgs)
for (i in unload_pkgs) {
  detach(i, character.only = TRUE)
}
+1

All Articles