Extract comma-separated strings

I have a data frame as shown below. This sample follows a consistent pattern, but the full data set is not as homogeneous:

locationid      address     
1073744023  525 East 68th Street, New York, NY      10065, USA
1073744022  270 Park Avenue, New York, NY 10017, USA      
1073744025  Rockefeller Center, 50 Rockefeller Plaza, New York, NY 10020, USA 
1073744024  1251 Avenue of the Americas, New York, NY 10020, USA
1073744021  1301 Avenue of the Americas, New York, NY 10019, USA 
1073744026  44 West 45th Street, New York, NY 10036, USA

I need to extract the city and country name from each address. I tried the following:

1) strsplit: this gives me a list, but I cannot work out how to access the last or third-from-last item of each element.

2) Regular expressions find the country easily

str_sub(str_extract(address, "\\d{5},\\s.*"),8,11)

but for the city

str_sub(str_extract(address, ",\\s.+,\\s.+\\d{5}"),3,comma_pos)

I cannot compute comma_pos, as that leads me back to the same problem. I believe there is a more efficient way to solve this using one of the methods above.

+4
6

 # Split each address on commas; `ss` is a list holding one character vector
 # of pieces per input string. `data` is assumed to be the character vector
 # of addresses (e.g. the `address` column) -- confirm against your data.
 ss <- strsplit(data, ",")

# Number of comma-separated pieces in each element of `ss`
# (integer vector, one entry per address).
n <- lengths(ss)

( ).

# Walk `ss` and `n` in parallel and take the n-th piece of each address.
# The function must be the first argument to mapply().
mapply(function(parts, k) parts[[k]], ss, n)

.

# Last comma-separated piece of every address.
sapply(ss, function(parts) tail(parts, n = 1))

.

- ( ),

# Keep the last two pieces of every address, then take the first of those.
sapply(ss, function(parts) {
  last_two <- tail(parts, n = 2)
  last_two[1]
})
+3

:

library(gsubfn)

# One capture group per output column, in column order.
pat <- "(\\d+) (.+), (.+), (..) (\\d+), (.+)"
cn <- c("Id", "Address", "City", "State", "Zip", "Country")

# Parse the text in `Lines` with the pattern; as.is = TRUE is passed through
# so that character fields are kept as character.
read.pattern(
  text = Lines,
  pattern = pat,
  col.names = cn,
  as.is = TRUE
)

: , :

          Id                                  Address     City State   Zip Country
1 1073744023                     525 East 68th Street New York    NY 10065     USA
2 1073744022                          270 Park Avenue New York    NY 10017     USA
3 1073744025 Rockefeller Center, 50 Rockefeller Plaza New York    NY 10020     USA
4 1073744024              1251 Avenue of the Americas New York    NY 10020     USA
5 1073744021              1301 Avenue of the Americas New York    NY 10019     USA
6 1073744026                      44 West 45th Street New York    NY 10036     USA

( ):

(\d+) (.+), (.+), (..) (\d+), (.+)

Visualization of the regular expression (via debuggex). Debuggex Demo:

Regular expression visualization

:

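  • "(\\d+)" - one or more digits, captured as the first group (Id),
  • " " - a space,
  • "(.+)" - one or more characters, captured (Address),
  • ", " - a comma followed by a space,
  • "(.+)" - one or more characters, captured (City),
  • ", " - a comma followed by a space,
  • "(..)" - exactly two characters, captured (State),
  • " " - a space,
  • "(\\d+)" - one or more digits, captured (Zip),
  • ", " - a comma followed by a space,
  • "(.+)" - one or more characters, captured (Country)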

, , , , .

, , , read.pattern :

: Lines:

# Sample input as a single newline-separated string, one record per line:
# "<id> <address>, <city>, <state> <zip>, <country>".
Lines <- paste(
  c(
    "1073744023 525 East 68th Street, New York, NY 10065, USA",
    "1073744022 270 Park Avenue, New York, NY 10017, USA",
    "1073744025 Rockefeller Center, 50 Rockefeller Plaza, New York, NY 10020, USA",
    "1073744024 1251 Avenue of the Americas, New York, NY 10020, USA",
    "1073744021 1301 Avenue of the Americas, New York, NY 10019, USA",
    "1073744026 44 West 45th Street, New York, NY 10036, USA"
  ),
  collapse = "\n"
)
+4

tidyr. , tidyr extract. , -, .

library(tidyr)

# Split `address` into its components, one capture group per new column.
# The pattern is not anchored at the start of the string, so when an address
# has an extra leading comma-separated part (row 3) only the part directly
# before the city is kept as the address.
# Column names must be unique: the last group is the country, so it is named
# "country" instead of reusing "state".
extract(
  x, address,
  c("address", "city", "state", "zip", "country"),
  "([^,]+),\\s([^,]+),\\s+([A-Z]+)\\s+(\\d+),\\s+([A-Z]+)"
)

##   locationid                       address     city state   zip country
## 1 1073744023          525 East 68th Street New York    NY 10065     USA
## 2 1073744022               270 Park Avenue New York    NY 10017     USA
## 3 1073744025          50 Rockefeller Plaza New York    NY 10020     USA
## 4 1073744024   1251 Avenue of the Americas New York    NY 10020     USA
## 5 1073744021   1301 Avenue of the Americas New York    NY 10019     USA
## 6 1073744026           44 West 45th Street New York    NY 10036     USA

Here is a visualization of the regular expression, from http://www.regexper.com/:

enter image description here

+3

, - .

> x <- "1073744026 44 West 45th Street, New York, NY 10036, USA"
> regmatches(x, gregexpr('^[^,]+, *\\K[^,]+', x, perl=T))[[1]]
[1] "New York"
> regmatches(x, gregexpr('^[^,]+, *[^,]+, *[^,]+, *\\K[^\n,]+', x, perl=T))[[1]]
[1] "USA"

Regex:

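  • ^ asserts that we are at the start of the string.
  • [^,]+ matches one or more characters that are not a comma. Use [^,]* instead if the field may be empty.
  • , matches a literal comma,
  • <space>* matches zero or more spaces.
  • \K discards everything matched so far, so only the text matched after \K is returned.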
+2

:

,\s(?<city>[^,]+?),\s(?<shortCity>[^,]+?)(?i:\d{5},)(?<country>\s.*)

:

  • "group" : "city", "value" : "New York"
  • "group" : "shortCity", "value" : "NY"
  • "group" : "country", "value" : "USA"
0

rex .

# Example data: six locations, each with a numeric id and a free-text address.
# The first address deliberately contains a run of spaces before the zip code.
x <- data.frame(
  locationid = c(
    1073744023, 1073744022, 1073744025,
    1073744024, 1073744021, 1073744026
  ),
  address = c(
    "525 East 68th Street, New York, NY      10065, USA",
    "270 Park Avenue, New York, NY 10017, USA",
    "Rockefeller Center, 50 Rockefeller Plaza, New York, NY 10020, USA",
    "1251 Avenue of the Americas, New York, NY 10020, USA",
    "1301 Avenue of the Americas, New York, NY 10019, USA",
    "44 West 45th Street, New York, NY 10036, USA"
  )
)

library(rex)

# Field separator: a comma followed by whitespace.
sep <- rex(",", spaces)

# Address pattern with one named capture per output column:
# address, city, state, zip and country, in that order.
re <- rex(
  capture(name = "address", except_some_of(",")),
  sep,
  capture(name = "city", except_some_of(",")),
  sep,
  capture(name = "state", uppers),
  spaces,
  # The zip group accepts digits and hyphens.
  capture(name = "zip", some_of(digit, "-")),
  sep,
  capture(name = "country", something)
)

# Apply the pattern to every address; yields one column per named capture.
re_matches(x$address, re)
#>                      address     city state   zip country
#>1        525 East 68th Street New York    NY 10065     USA
#>2             270 Park Avenue New York    NY 10017     USA
#>3        50 Rockefeller Plaza New York    NY 10020     USA
#>4 1251 Avenue of the Americas New York    NY 10020     USA
#>5 1301 Avenue of the Americas New York    NY 10019     USA
#>6         44 West 45th Street New York    NY 10036     USA

This version also handles 9-digit zip codes (12345-1234), because the zip group accepts digits and hyphens.

0
source

All Articles