Split Data into Test and Train Set

Split data from vector Y into two sets in a predefined ratio while preserving the relative ratios of the different labels in Y. Used to split the data used during classification into train and test subsets.

sample.split( Y, SplitRatio = 2/3, group = NULL )

Arguments

Y

Vector of data labels. If there are only a few labels (as is expected) then the relative ratio of data in both subsets will be the same.

SplitRatio

Splitting ratio:

if (0<=SplitRatio<1) then SplitRatio fraction of points from Y will be set to TRUE
if (SplitRatio==1) then one random point from Y will be set to TRUE
if (SplitRatio>1) then SplitRatio number of points from Y will be set to TRUE

group

Optional vector/list used when multiple copies of each sample are present. In such a case group contains unique sample labels, marking all copies of the same sample with the same label, and the function tries to place all copies in either the train or the test subset. If provided, it has to have the same length as Y.

Details

msc.sample.split is the old name of the sample.split function and is to be retired soon. Note that the function differs from base::sample by first restricting the input data set to its unique values before generating the subset(s).

Value

Returns a logical vector of the same length as Y with randomly chosen elements set to TRUE: SplitRatio*length(Y) of them when 0<=SplitRatio<1 (see the SplitRatio argument for the other cases).

Author

Jarek Tuszynski (SAIC) jaroslaw.w.tuszynski@saic.com

Examples

  library(MASS)
  data(cats)      # load cats data
  Y <- cats[, 1]  # the first column holds the labels
  msk <- sample.split(Y, SplitRatio = 3 / 4)
  table(Y, msk)
#>    msk
#> Y   FALSE TRUE
#>   F    12   35
#>   M    24   73
  t <- sum(msk)   # how many entries of the mask are TRUE
  f <- sum(!msk)  # how many entries of the mask are FALSE
  # the TRUE share of the mask has to match the requested 3/4 ratio
  stopifnot(t == round((t + f) * 3 / 4))
  
  # example of using group variable
  g <- rep(seq(length(Y) / 4), each = 4)  # every run of four elements shares a label
  g[48] <- 12                             # label of element 48 set explicitly
  msk <- sample.split(Y, SplitRatio = 1 / 2, group = g)
  table(Y, msk) # aim for the requested split ratio within each label ...
#>    msk
#> Y   FALSE TRUE
#>   F    23   24
#>   M    49   48
  split(msk, g) # ... while elements sharing a group label stay together
#> $`1`
#> [1] TRUE TRUE TRUE TRUE
#> 
#> $`2`
#> [1] FALSE FALSE FALSE FALSE
#> 
#> $`3`
#> [1] FALSE FALSE FALSE FALSE
#> 
#> $`4`
#> [1]  TRUE FALSE FALSE FALSE
#> 
#> $`5`
#> [1] FALSE FALSE FALSE FALSE
#> 
#> $`6`
#> [1] TRUE TRUE TRUE TRUE
#> 
#> $`7`
#> [1] TRUE TRUE TRUE TRUE
#> 
#> $`8`
#> [1] TRUE TRUE TRUE TRUE
#> 
#> $`9`
#> [1] FALSE FALSE FALSE FALSE
#> 
#> $`10`
#> [1] FALSE FALSE FALSE FALSE
#> 
#> $`11`
#> [1] TRUE TRUE TRUE TRUE
#> 
#> $`12`
#> [1]  TRUE  TRUE  TRUE FALSE
#> 
#> $`13`
#> [1] TRUE TRUE TRUE TRUE
#> 
#> $`14`
#> [1] FALSE FALSE FALSE FALSE
#> 
#> $`15`
#> [1] TRUE TRUE TRUE TRUE
#> 
#> $`16`
#> [1] TRUE TRUE TRUE TRUE
#> 
#> $`17`
#> [1] TRUE TRUE TRUE TRUE
#> 
#> $`18`
#> [1] FALSE FALSE FALSE FALSE
#> 
#> $`19`
#> [1] TRUE TRUE TRUE TRUE
#> 
#> $`20`
#> [1] FALSE FALSE FALSE FALSE
#> 
#> $`21`
#> [1] TRUE TRUE TRUE TRUE
#> 
#> $`22`
#> [1] FALSE FALSE FALSE FALSE
#> 
#> $`23`
#> [1] TRUE TRUE TRUE TRUE
#> 
#> $`24`
#> [1] TRUE TRUE TRUE TRUE
#> 
#> $`25`
#> [1] FALSE FALSE FALSE FALSE
#> 
#> $`26`
#> [1] FALSE FALSE FALSE FALSE
#> 
#> $`27`
#> [1] TRUE TRUE TRUE TRUE
#> 
#> $`28`
#> [1] FALSE FALSE FALSE FALSE
#> 
#> $`29`
#> [1] FALSE FALSE FALSE FALSE
#> 
#> $`30`
#> [1] TRUE TRUE TRUE TRUE
#> 
#> $`31`
#> [1] TRUE TRUE TRUE TRUE
#> 
#> $`32`
#> [1] FALSE FALSE FALSE FALSE
#> 
#> $`33`
#> [1] FALSE FALSE FALSE FALSE
#> 
#> $`34`
#> [1] TRUE TRUE TRUE TRUE
#> 
#> $`35`
#> [1] FALSE FALSE FALSE FALSE
#> 
#> $`36`
#> [1] FALSE FALSE FALSE FALSE
#> 

  # test results
  # Recompute the totals from the current mask: t and f were last assigned for
  # the earlier SplitRatio=3/4 split, so reusing them here would summarise that
  # split instead of the grouped SplitRatio=1/2 split checked below.
  t <- sum( msk)  # number of elements with msk TRUE  (reported as "train")
  f <- sum(!msk)  # number of elements with msk FALSE (reported as "test")
  print(paste( "All Labels numbers: total=",t+f,", train=",t,", test=",f,
        ", ratio=", t/(t+f) ) )
#> [1] "All Labels numbers: total= 144 , train= 72 , test= 72 , ratio= 0.5"
  U <- unique(Y)      # extract all unique labels
  for( i in seq_along(U)) {  # check for all labels; safe even if U is empty
    lab <- (Y==U[i])  # mask elements that have label U[i]
    t <- sum( msk[lab])  # number of elements with label U[i] in one class
    f <- sum(!msk[lab])  # number of elements with label U[i] in the other class
    print(paste( "Label",U[i],"numbers: total=",t+f,", train=",t,", test=",f, 
                 ", ratio=", t/(t+f) ) )
  }
#> [1] "Label F numbers: total= 47 , train= 24 , test= 23 , ratio= 0.51063829787234"
#> [1] "Label M numbers: total= 97 , train= 48 , test= 49 , ratio= 0.494845360824742"
  
  # use results
  train <- cats[msk, 2:3]     # rows flagged TRUE by sample.split: train subset
  test <- cats[!msk, 2:3]     # rows flagged FALSE: test subset
  fit <- lda(train, Y[msk])   # fit the classifier on the train subset
  table(predict(fit, test)$class, Y[!msk]) # predicted labels against true labels
#>    
#>      F  M
#>   F 13  5
#>   M 10 44
  
  # see also LogitBoost example

Arguments

Details

Value

Author

See also

Examples