R text mining: grouping similar patterns from a data frame.

I applied various cleaning functions from the tm package, such as removing punctuation, numbers, special characters, and common English stop words, and ended up with the data frame shown below. Note that I don't have a primary key such as cust_id or account_number to rely on.

sno        names
001        SIRIS BLACK
002        JOHN DOE
003        STEPHEN HRYY
004        SIRIUS BLACK
005        SIRUS BLACK
006        JON DOE
007        STEPHEN HARRY
008        STIPHEN HURRY
009        JHN DOE 

Looking at the data above, the names clearly fall into groups of near-identical spellings. How can I calculate a similarity percentage between these names using the text mining functions available in R, so that I finally get a data frame containing only the unique names?

Assumptions and Disadvantages:

  • , , , , , . ( , , )

  • agrep() , , , .

:

sno        names
001        SIRIS BLACK          
002        SIRIUS BLACK
003        SIRUS BLACK
004        JHN DOE
005        JOHN DOE
006        JON DOE
007        STEPHEN HARRY
008        STIPHEN HURRY
009        STEPHEN HRYY

, , :

001     JOHN DOE
002     STEPHEN HARRY
003     STIPHEN HURRY
004     SIRIUS BLACK
+4
2

One approach, using agrep:

# Group approximately-equal names with agrep().
# Expects a data frame `df` with a `names` column.
#
# For every row, find the indices of all names that match it within the
# allowed edit distance (up to 2 edits in total, made of insertions and
# deletions only; substitutions are not allowed). seq_len() keeps this safe
# for a zero-row data frame, where 1:nrow(df) would iterate over c(1, 0).
sim <- setNames(
  lapply(seq_len(nrow(df)), function(i) {
    agrep(
      df$names[i], df$names,
      max.distance = list(all = 2, insertions = 2, deletions = 2, substitutions = 0)
    )
  }),
  df$names
)
# Turn the matched indices into the distinct names they point at. Coercing to
# character keeps the result type stable even when `names` is a factor.
sim <- lapply(sim, function(x) unique(as.character(df$names[x])))
# The first match of each group serves as that row's representative name.
df$names2 <- vapply(sim, function(x) x[1], character(1))
# Keep one row per representative name.
df[!duplicated(df$names2), ]
#   sno         names        names2
# 1   1   SIRIS BLACK   SIRIS BLACK
# 2   2      JOHN DOE      JOHN DOE
# 3   3  STEPHEN HRYY  STEPHEN HRYY
# 8   8 STIPHEN HURRY STIPHEN HURRY
+4

Alternatively, you can use the RecordLinkage package. Sample data and code follow.

# Sample data used by the RecordLinkage example: `sno` holds the first names
# and `names` the surnames (both factors); `both.names` is the full name as a
# character vector. The result is assigned to `df` because the call further
# down reads `df$both.names`; without the assignment `df` would resolve to the
# stats::df function.
df <- structure(list(sno = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 
7L, 8L), .Label = c("JHN", "JOHN", "JON", "SIRIS", "SIRIUS", 
"SIRUS", "STEPHEN", "STIPHEN"), class = "factor"), names = structure(c(2L, 
2L, 2L, 1L, 1L, 1L, 3L, 4L, 5L), .Label = c("BLACK", "DOE", "HARRY", 
"HRYY", "HURRY"), class = "factor"), both.names = c("JHN DOE", 
"JOHN DOE", "JON DOE", "SIRIS BLACK", "SIRIUS BLACK", "SIRUS BLACK", 
"STEPHEN HARRY", "STEPHEN HRYY", "STIPHEN HURRY")), .Names = c("sno", 
"names", "both.names"), row.names = c("009", "002", "006", "001", 
"004", "005", "007", "003", "008"), class = "data.frame")

library("RecordLinkage")
# Flag which candidates are similar enough to a given string.
#
# Args:
#   string: the value compared against every element of `vec`.
#   vec:    vector of candidate strings.
#   cutoff: threshold; a candidate matches only when its jarowinkler() score
#           is strictly greater than this value.
#
# Returns:
#   The result of `jarowinkler(string, vec) > cutoff`, i.e. one logical per
#   element of `vec`.
#
# The function is called through its namespace instead of require(): require()
# merely returns FALSE when the package is missing (the failure then surfaces
# later as an unrelated "could not find function" error) and attaches the
# package to the search path as a side effect.
compareJW <- function(string, vec, cutoff) {
  RecordLinkage::jarowinkler(string, vec) > cutoff
}

# Collapse near-duplicate names onto the first spelling seen for each group.
#
# Names are processed in order. Each name is compared (via compareJW) with the
# representative names accepted so far:
#   * no match       -> the name becomes a new representative;
#   * exactly one    -> the name is replaced by that representative;
#   * more than one  -> ambiguous: a warning is raised and NA is returned for
#                       that position.
# NA inputs stay NA and are never used as representatives.
#
# Args:
#   firms:  vector of names; coerced with as.character(), so factors are
#           handled by label rather than by integer code.
#   cutoff: similarity threshold passed through to compareJW().
#
# Returns:
#   A character vector the same length as `firms` (character(0) for empty
#   input).
shortenFirms <- function(firms, cutoff) {
  firms <- as.character(firms)
  n <- length(firms)
  if (n == 0L) {
    return(character(0))
  }

  # Preallocate the result; positions that are never filled stay NA.
  shortnames <- rep(NA_character_, n)
  shortnames[1] <- firms[1]

  # Representatives accepted so far, in order of first appearance. Tracking
  # them incrementally avoids recomputing unique() over the whole result on
  # every iteration.
  unique.short <- if (is.na(firms[1])) character(0) else firms[1]

  for (i in seq_len(n)[-1]) {
    firm <- firms[i]
    if (is.na(firm)) { # no firm name, so leave the preallocated NA in place
      next
    }
    hits <- compareJW(firm, unique.short, cutoff)
    n.hits <- sum(hits)
    if (n.hits > 1) {
      warning("shortenFirms: more than one match for ", firm, call. = FALSE)
    } else if (n.hits == 0) {
      shortnames[i] <- firm
      unique.short <- c(unique.short, firm)
    } else {
      shortnames[i] <- unique.short[hits]
    }
  }
  shortnames
}

# Collapse the near-duplicate full names using a similarity cutoff of 0.8.
shortenFirms(df$both.names, 0.8)

[1] "JHN DOE"       "JHN DOE"       "JHN DOE"       "SIRIS BLACK"   "SIRIS BLACK"   "SIRIS BLACK"   "STEPHEN HARRY"
[8] "STEPHEN HARRY" "STEPHEN HARRY"
0

All Articles