I am using the Bioconductor CMA package to perform Monte Carlo cross-validation (MCCV) of SVM classifiers on a microarray dataset; internally, CMA uses the e1071 R package for the SVM work.
The dataset contains 387 variables (attributes) for 45 samples (observations) that belong to one of two classes (labels 0 or 1, in roughly 1:1 proportion). All data are numeric, with no NAs. I am running a 1000-iteration MCCV, with 15 variables selected for the SVM using limma statistics for differential gene-expression analysis. In each MCCV iteration, part of the 45 samples is used to train the SVM classifier, which is then tested on the remaining samples, and I try different values for the fraction of samples set aside for training. CMA also performs an inner-loop cross-validation (by default, 3-fold CV within each training set) to tune the classifiers that are then evaluated against the test sets. All of this is done with the CMA package.
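To make that inner loop concrete: with my smallest training sets, each inner 3-fold model is fit on only about two-thirds of an already small training set. A rough, self-contained illustration in plain R (this mimics a 3-fold split by hand and is not CMA's actual inner-split code; the made-up labels simply match the proportions of 'classes' in the script below):

set.seed(1)
cls <- c(rep(1, 22), rep(0, 23))      # same class proportions as 'classes' below
train <- sample(seq_along(cls), 15)   # one hypothetical MCCV training set of 15 samples
folds <- split(sample(train), rep(1:3, length.out=15))
# class composition of the 10 samples each inner tuning model is actually fit on
lapply(folds, function(held) table(cls[setdiff(train, held)]))

So during tuning each inner model sees only 10 samples, and an unlucky outer draw can leave one class with very few members in an inner fold's training part.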
Sometimes, for small training-set sizes, CMA throws an error in the console and stops the rest of the classification code from executing.
[snip] tuning iteration 575
tuning iteration 576
tuning iteration 577
Error in predict.svm(ret, xhold, decision.values = TRUE) : Model is empty!
This happens even when I use a test other than limma for variable selection, or when 2 variables instead of 15 are used to build the classifier. The R code I use should ensure that the training sets always contain members of both classes. I would appreciate any insight into this.
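For illustration, a small self-contained check along these lines (stand-in labels with the same proportions as 'classes' below; the learnmatrix slot is the same one the full script inspects):

library(CMA)
cls <- c(rep(1, 22), rep(0, 23))   # stand-in for 'classes' below
ls12 <- GenerateLearningsets(n=length(cls), y=cls, method='MCCV', niter=1000, ntrain=12)
# number of distinct classes in each MCCV training set; expect 2 for every iteration
nCls <- apply(ls12@learnmatrix, 1, function(idx) length(unique(cls[idx])))
sum(nCls < 2)   # should be 0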
Below is the R code I'm using with Mac OS X 10.6.6, R 2.12.1, Biobase 2.10.0, CMA 1.8.1, limma 3.6.9, and WilcoxCV 1.0.2. The hy3ExpHsaMir.txt data file can be downloaded from http://rapidshare.com/files/447062901/hy3ExpHsaMir.txt .
Everything goes fine until g reaches 9 in the for(g in 0:10) loop (which varies the training/test-set sizes); at g = 9, ntest = 30, so only 15 of the 45 samples are left for training.
# exp is the expression table, a matrix; 'classes' is the vector of known class labels
exp <- as.matrix(read.table(file='hy3ExpHsaMir.txt', sep='\t', row.names=1, header=T, check.names=F))
# best is to use 0 and 1 as class labels (instead of 'p', 'g', etc.), with 1 for 'positive' items (positive for recurrence, or for disease, etc.)
classes <- c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
yesPredVal = 1 # class label for 'positive' items in 'classes'
library(CMA)
library(WilcoxCV)
myNumFun <- function(x, y){round(y(as.numeric(x), na.rm=T), 4)}
set.seed(631)
out = ''
out2 = '\nEffect of varying the training-set size:\nTraining-set size\tSuccessful iterations\tMean acc.\tSD acc.\tMean sens.\tSD sens.\tMean spec.\tSD spec.\tMean PPV\tSD PPV\tMean NPV\tSD NPV\tTotal genes in the classifiers\n'
niter = 1000
diffTest = 'limma'
diffGeneNum = 15
svmCost <- c(0.1, 0.2, 0.5, 1, 2, 5, 10, 20, 50)
for(g in 0:10){ # varying the training/test-set sizes
  ntest = 3+g*3 # test-set size
  result <- matrix(nrow=0, ncol=7)
  colnames(result) <- c('trainSetSize', 'iteration', 'acc', 'sens', 'spec', 'ppv', 'npv')
  diffGenes <- numeric()
  # generate training and test sets
  lsets <- GenerateLearningsets(n=ncol(exp), y=classes, method=c('MCCV'), niter=niter, ntrain=ncol(exp)-ntest)
  # actual prediction work
  svm <- classification(t(exp), factor(classes), learningsets=lsets, genesellist= list(method=diffTest), classifier=svmCMA, nbgene= diffGeneNum, tuninglist=list(grids=list(cost=svmCost)), probability=TRUE)
  svm <- join(svm)
  # genes in classifiers
  svmGenes <- GeneSelection(t(exp), classes, learningsets=lsets, method=diffTest)
  actualIters=0
  for(h in 1:niter){
    m <- ntest*(h-1)
    # valid SVM classification requires min. 2 classes
    if(1 < length(unique(classes[-lsets@learnmatrix[h,]]))){
      actualIters = actualIters+1
      tp <- tn <- fp <- fn <- 0
      for(i in 1:ntest){
        pred <- svm@yhat[m+i]
        known <- svm@y[m+i]
        if(pred == known){
          if(pred == yesPredVal){tp <- tp+1}
          else{tn <- tn+1}
        }else{
          if(pred == yesPredVal){fp <- fp+1}
          else{fn <- fn+1}
        }
      }
      result <- rbind(result, c(ncol(exp)-ntest, h, (tp+tn)/(tp+tn+fp+fn), tp/(tp+fn), tn/(tn+fp), tp/(tp+fp), tn/(tn+fn)))
      diffGenes <- c(diffGenes, toplist(svmGenes, k=diffGeneNum, iter=h, show=F)$index)
    } # end if valid SVM
  } # end for h
  # output accuracy, etc.
  out = paste(out, 'SVM MCCV using ', niter, ' attempted iterations and ', actualIters, ' successful iterations, with ', ncol(exp)-ntest, ' of ', ncol(exp), ' total samples used for training:\nThe means (ranges; SDs) of prediction accuracy, sensitivity, specificity, PPV and NPV in fractions are ',
    myNumFun(result[, 'acc'], mean), ' (', myNumFun(result[, 'acc'], min), '-', myNumFun(result[, 'acc'], max), '; ', myNumFun(result[, 'acc'], sd), '), ',
    myNumFun(result[, 'sens'], mean), ' (', myNumFun(result[, 'sens'], min), '-', myNumFun(result[, 'sens'], max), '; ', myNumFun(result[, 'sens'], sd), '), ',
    myNumFun(result[, 'spec'], mean), ' (', myNumFun(result[, 'spec'], min), '-', myNumFun(result[, 'spec'], max), '; ', myNumFun(result[, 'spec'], sd), '), ',
    myNumFun(result[, 'ppv'], mean), ' (', myNumFun(result[, 'ppv'], min), '-', myNumFun(result[, 'ppv'], max), '; ', myNumFun(result[, 'ppv'], sd), '), and ',
    myNumFun(result[, 'npv'], mean), ' (', myNumFun(result[, 'npv'], min), '-', myNumFun(result[, 'npv'], max), '; ', myNumFun(result[, 'npv'], sd), '), respectively.\n', sep='')
  # output classifier genes
  diffGenesUnq <- unique(diffGenes)
  out = paste(out, 'A total of ', length(diffGenesUnq), ' genes occur in the ', actualIters, ' classifiers, with occurrence frequencies in fractions:\n', sep='')
  for(i in 1:length(diffGenesUnq)){
    out = paste(out, rownames(exp)[diffGenesUnq[i]], '\t', round(sum(diffGenes == diffGenesUnq[i])/actualIters, 3), '\n', sep='')
  }
  # output split-size effect
  out2 = paste(out2, ncol(exp)-ntest, '\t', actualIters, '\t', myNumFun(result[, 'acc'], mean), '\t', myNumFun(result[, 'acc'], sd), '\t', myNumFun(result[, 'sens'], mean), '\t', myNumFun(result[, 'sens'], sd), '\t', myNumFun(result[, 'spec'], mean), '\t', myNumFun(result[, 'spec'], sd), '\t', myNumFun(result[, 'ppv'], mean), '\t', myNumFun(result[, 'ppv'], sd),
    '\t', myNumFun(result[, 'npv'], mean), '\t', myNumFun(result[, 'npv'], sd), '\t', length(diffGenesUnq), '\n', sep='')
} # end for g
cat(out, out2, sep='')
Output of traceback():
20: stop("Model is empty!")
19: predict.svm(ret, xhold, decision.values = TRUE)
18: predict(ret, xhold, decision.values = TRUE)
17: na.action(predict(ret, xhold, decision.values = TRUE))
16: svm.default(cost = 0.1, kernel = "linear", type = "C-classification", ...
15: svm(cost = 0.1, kernel = "linear", type = "C-classification", ...
14: do.call("svm", args = ll)
13: function (X, y, f, learnind, probability, models = FALSE, ...) ...
12: function (X, y, f, learnind, probability, models = FALSE, ...) ...
11: do.call(classifier, args = c(list(X = X, y = y, learnind = learnmatrix[i, ...
10: classification(X = c(83.5832768669369, 83.146333099001, 94.253534443549, ...
9: classification(X = c(83.5832768669369, 83.146333099001, 94.253534443549, ...
8: do.call("classification", args = c(list(X = Xi, y = yi, learningsets = lsi, ...
7: tune(grids = list(cost = c(0.1, 0.2, 0.5, 1, 2, 5, 10, 20, 50...
6: tune(grids = list(cost = c(0.1, 0.2, 0.5, 1, 2, 5, 10, 20, 50...
5: do.call("tune", args = c(tuninglist, ll))
4: classification(X, y = as.numeric(y) - 1, learningsets = learningsets, ...
3: classification(X, y = as.numeric(y) - 1, learningsets = learningsets, ...
2: classification(t(exp), factor(classes), learningsets = lsets, ...
1: classification(t(exp), factor(classes), learningsets = lsets, ...
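One stopgap would be to wrap the classification() call inside the for(g in 0:10) loop in tryCatch, so that a split size whose tuning fails is skipped rather than aborting everything. Roughly (same arguments as in the script above; a drop-in replacement for the classification()/join() lines):

svmTry <- tryCatch(
  classification(t(exp), factor(classes), learningsets=lsets, genesellist=list(method=diffTest), classifier=svmCMA, nbgene=diffGeneNum, tuninglist=list(grids=list(cost=svmCost)), probability=TRUE),
  error=function(e){ message('g = ', g, ' failed: ', conditionMessage(e)); NULL })
if(is.null(svmTry)) next   # skip this training-set size and move on to the next g
svm <- join(svmTry)

But that only hides the abort; I would still like to understand why the tuned SVM model ends up empty for small training sets.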