How to create an example data set from private data (replacing variable names and levels with uninformative placeholders)?

I don’t know whether there was a function to automate this, but now there is 😉

## Anonymise the columns of a data.frame selected by 'colIDs'.
##
## Arguments:
##   df      a data.frame
##   colIDs  the columns to anonymise, given either as column names
##           (character) or as integer indices
##
## For every selected column the values are replaced by a randomly drawn
## capital letter followed by the integer code the value gets when the
## column is converted to a factor, so equal values stay equal.  Every
## selected column is then renamed to "V<column index>".
##
## Returns the modified copy of 'df'; stops with an error if a requested
## column cannot be found.
anonymiseColumns <- function(df, colIDs) {
    ## Resolve names to positions once, so the loop and the renaming below
    ## both work on the same integer indices.
    ids <- if (is.character(colIDs)) match(colIDs, names(df)) else colIDs
    if (anyNA(ids)) {
        stop("unknown column(s): ",
             paste(colIDs[is.na(ids)], collapse = ", "),
             call. = FALSE)
    }
    for (id in ids) {
        prefix <- sample(LETTERS, 1)
        suffix <- as.character(as.numeric(as.factor(df[[id]])))
        df[[id]] <- paste0(prefix, suffix)
    }
    ## Rename all selected columns at once (not just the last one visited).
    names(df)[ids] <- paste0("V", ids)
    df
}

## Build a small data.frame holding sensitive information:
##   name    - five lines taken from R's AUTHORS file, each repeated twice
##   hiscore - ten uniform random numbers between 99 and 100
##   passwd  - ten random nine-letter strings
df <- data.frame(
    name = rep(
        readLines(file.path(R.home("doc"), "AUTHORS"))[9:13],
        each = 2
    ),
    hiscore = runif(n = 10, min = 99, max = 100),
    passwd = vapply(
        seq_len(10),
        function(i) paste(sample(c(LETTERS, letters), size = 9), collapse = ""),
        character(1)
    )
)

## Anonymise the first and third columns
df2 <- anonymiseColumns(df, colIDs = c(1, 3))

## Check that it worked
> head(df, 3)
           name  hiscore    passwd
1 Douglas Bates 99.96714 ROELIAncz
2 Douglas Bates 99.07243 gDOLNMyVe
3 John Chambers 99.55322 xIVPHDuEW    

> head(df2, 3)
  name hiscore  V3
1   Q1 99.96714 V8
2   Q1 99.07243 V2
3   Q2 99.55322 V9

Leave a Comment