# Uses the RecordLinkage package.
# Example data: nine name records.
#   sno        - first name as written (factor)
#   names      - surname as written (factor)
#   both.names - first name and surname joined by a space (character)
# The literal is bound to `df` because the call further down reads
# `df$both.names`; without the assignment `df` would resolve to the density
# function from the stats package and that call would fail.
df <- structure(
  list(
    sno = structure(
      c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 7L, 8L),
      .Label = c(
        "JHN", "JOHN", "JON", "SIRIS", "SIRIUS", "SIRUS", "STEPHEN", "STIPHEN"
      ),
      class = "factor"
    ),
    names = structure(
      c(2L, 2L, 2L, 1L, 1L, 1L, 3L, 4L, 5L),
      .Label = c("BLACK", "DOE", "HARRY", "HRYY", "HURRY"),
      class = "factor"
    ),
    both.names = c(
      "JHN DOE", "JOHN DOE", "JON DOE",
      "SIRIS BLACK", "SIRIUS BLACK", "SIRUS BLACK",
      "STEPHEN HARRY", "STEPHEN HRYY", "STIPHEN HURRY"
    )
  ),
  .Names = c("sno", "names", "both.names"),
  row.names = c("009", "002", "006", "001", "004", "005", "007", "003", "008"),
  class = "data.frame"
)
library("RecordLinkage")
#' Flag candidates whose Jaro-Winkler score against a string exceeds a cutoff
#'
#' @param string String to compare; passed as the first argument of
#'   `RecordLinkage::jarowinkler()`.
#' @param vec Candidate strings; passed as the second argument.
#' @param cutoff Numeric threshold. The comparison is strict (`>`), so a score
#'   exactly equal to `cutoff` does not count as a match.
#' @return The value of `jarowinkler(string, vec) > cutoff` -- presumably one
#'   logical per element of `vec`; confirm against the RecordLinkage manual.
compareJW <- function(string, vec, cutoff) {
  # Namespace-qualified call instead of require(): require() merely returns
  # FALSE when the package is missing, so the failure used to surface later as
  # "could not find function". `::` fails immediately with a clear message and
  # does not attach the package as a side effect of every call.
  RecordLinkage::jarowinkler(string, vec) > cutoff
}
#' Collapse near-duplicate names onto the first spelling seen
#'
#' Walks `firms` in order while keeping a list of canonical names. Every
#' non-missing entry after the first is compared against the canonical names
#' with `compareJW()`:
#'   * no match          -> the entry is kept and becomes a new canonical name;
#'   * exactly one match -> the entry is replaced by that canonical name;
#'   * several matches   -> the entry becomes `NA` and a warning is raised.
#' Missing entries stay `NA` and never become canonical names. The first entry
#' is always kept as is.
#'
#' @param firms Vector of names. Coerced with `as.character()`, so a factor
#'   column can be passed directly.
#' @param cutoff Threshold handed unchanged to `compareJW()`.
#' @return A character vector with the same length as `firms`; `character(0)`
#'   for empty input.
shortenFirms <- function(firms, cutoff) {
  # Factors would otherwise degrade to integer codes once combined with
  # character values.
  firms <- as.character(firms)
  n <- length(firms)

  # Preallocated result: any slot not filled in below stays NA.
  shortnames <- rep(NA_character_, n)
  if (n == 0L) {
    return(shortnames)
  }

  shortnames[1L] <- firms[1L]
  # Distinct non-missing names accepted so far, in order of first appearance.
  canonical <- if (is.na(firms[1L])) character(0) else firms[1L]

  for (i in seq_len(n)[-1L]) {
    firm <- firms[i]
    if (is.na(firm)) {
      # No name to compare: leave the preallocated NA in place.
      next
    }
    hits <- compareJW(firm, canonical, cutoff)
    n_hits <- sum(hits)
    if (n_hits > 1) {
      # Ambiguous: leave NA rather than guess which canonical name is meant.
      warning(paste("shortenFirms: more than one match for", firm))
    } else if (n_hits == 0) {
      shortnames[i] <- firm
      if (!(firm %in% canonical)) {
        canonical <- c(canonical, firm)
      }
    } else {
      shortnames[i] <- canonical[hits]
    }
  }
  shortnames
}
# Run the matcher over the full-name column with a cutoff of 0.8.
shortenFirms(df[["both.names"]], cutoff = 0.8)
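# Console echo of the call above and the result vector shown with it
# (commented out so the file parses as R):
# > shortenFirms(df$both.names, 0.8)
# [1] "JHN DOE" "JHN DOE" "JHN DOE" "SIRIS BLACK" "SIRIS BLACK" "SIRIS BLACK" "STEPHEN HARRY"
# [8] "STEPHEN HARRY" "STEPHEN HARRY"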