Extract comma-separated strings

Question

Extract comma-separated strings

I have a data frame as shown below. This is an example of data set with homogeneous patterns, but the whole data is not very homogeneous:

locationid      address     
1073744023  525 East 68th Street, New York, NY      10065, USA
1073744022  270 Park Avenue, New York, NY 10017, USA      
1073744025  Rockefeller Center, 50 Rockefeller Plaza, New York, NY 10020, USA 
1073744024  1251 Avenue of the Americas, New York, NY 10020, USA
1073744021  1301 Avenue of the Americas, New York, NY 10019, USA 
1073744026  44 West 45th Street, New York, NY 10036, USA

I need to find the city and country name at this address. I tried the following:

1) strsplit This gives me a list, but I cannot access the last or third last item from this.

2) Regular expressions find the country easily

str_sub(str_extract(address, "\\d{5},\\s.*"),8,11)

but for the city

str_sub(str_extract(address, ",\\s.+,\\s.+\\d{5}"),3,comma_pos)

I can not find comma_pos, as this again leads me to the same problem. I believe that there is a more effective way to solve this problem using any of the above methods.

+4

string regex r strsplit

Cagg Nov 27 '14 at 13:36

6

:

library(gsubfn)

# One capture group per output column. `Lines` (defined further down) holds
# one "<id> <address>" record per line.
pattern <- "(\\d+) (.+), (.+), (..) (\\d+), (.+)"
col_names <- c("Id", "Address", "City", "State", "Zip", "Country")

read.pattern(
  text = Lines,
  pattern = pattern,
  col.names = col_names,
  as.is = TRUE
)

: , :

          Id                                  Address     City State   Zip Country
1 1073744023                     525 East 68th Street New York    NY 10065     USA
2 1073744022                          270 Park Avenue New York    NY 10017     USA
3 1073744025 Rockefeller Center, 50 Rockefeller Plaza New York    NY 10020     USA
4 1073744024              1251 Avenue of the Americas New York    NY 10020     USA
5 1073744021              1301 Avenue of the Americas New York    NY 10019     USA
6 1073744026                      44 West 45th Street New York    NY 10036     USA

( ):

(\d+) (.+), (.+), (..) (\d+), (.+)

Regular expression visualization (Debuggex Demo):

:

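"(\\d+)" - one or more digits (captured as Id),
" " - a space,
"(.+)" - one or more characters (captured as Address),
", " - a comma followed by a space,
"(.+)" - one or more characters (captured as City),
", " - a comma followed by a space,
"(..)" - exactly two characters (captured as State),
" " - a space,
"(\\d+)" - one or more digits (captured as Zip),
", " - a comma followed by a space,
"(.+)" - one or more characters (captured as Country)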

, , , , .

, , , read.pattern :

Note: the input Lines used above is:

# Sample input: a single string with one "<locationid> <address>" record per
# line (records are joined with newlines).
Lines <- paste(
  c(
    "1073744023 525 East 68th Street, New York, NY 10065, USA",
    "1073744022 270 Park Avenue, New York, NY 10017, USA",
    "1073744025 Rockefeller Center, 50 Rockefeller Plaza, New York, NY 10020, USA",
    "1073744024 1251 Avenue of the Americas, New York, NY 10020, USA",
    "1073744021 1301 Avenue of the Americas, New York, NY 10019, USA",
    "1073744026 44 West 45th Street, New York, NY 10036, USA"
  ),
  collapse = "\n"
)

+4

G. Grothendieck Nov 27 '14 at 15:16

Here is a tidyr approach, using tidyr's extract function with one capture group per output column.

library(tidyr)

# Split `address` into one new column per capture group. `x` is the data frame
# with `locationid` and `address` columns.
# The fifth group captures the country, so it is named "country" (it was
# previously a second "state", producing duplicate column names).
# Note: the first group cannot contain a comma, so for row 3 only the
# "50 Rockefeller Plaza" part of the street address is kept.
extract(
  x, address, c("address", "city", "state", "zip", "country"),
  "([^,]+),\\s([^,]+),\\s+([A-Z]+)\\s+(\\d+),\\s+([A-Z]+)"
)

##   locationid                       address     city state   zip country
## 1 1073744023          525 East 68th Street New York    NY 10065     USA
## 2 1073744022               270 Park Avenue New York    NY 10017     USA
## 3 1073744025          50 Rockefeller Plaza New York    NY 10020     USA
## 4 1073744024   1251 Avenue of the Americas New York    NY 10020     USA
## 5 1073744021   1301 Avenue of the Americas New York    NY 10019     USA
## 6 1073744026           44 West 45th Street New York    NY 10036     USA

Here is a visualization of the regular expression, from http://www.regexper.com/:

+3

Tyler Rinker Nov 27 '14 at 15:38

, - .

x <- "1073744026 44 West 45th Street, New York, NY 10036, USA"

# City: match the first comma-separated field, then use \K (a PCRE feature,
# hence perl = TRUE) to drop everything matched so far, so that only the
# second field is returned.
regmatches(x, gregexpr('^[^,]+, *\\K[^,]+', x, perl = TRUE))[[1]]
#> [1] "New York"

# Country: skip the first three fields the same way and return the fourth.
regmatches(x, gregexpr('^[^,]+, *[^,]+, *[^,]+, *\\K[^\n,]+', x, perl = TRUE))[[1]]
#> [1] "USA"

Regex:

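^ asserts that we are at the start of the string.
[^,]+ matches any character except a comma, one or more times ([^,]* would match zero or more times).
, matches a literal comma.
<space>* matches zero or more spaces.
\K discards the characters matched so far, so only the text matched after \K is returned.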

+2

Avinash Raj Nov 27 '14 at 13:46

:

,\s(?<city>[^,]+?),\s(?<shortCity>[^,]+?)(?i:\d{5},)(?<country>\s.*)

:

"group" : "city", "value" : "New York"
"group" : "shortCity", "value" : "NY"
"group" : "country", "value" : "USA"

0

Bambuk Nov 27 '14 at 13:54

rex .

# Example data: one row per location, with the full postal address stored as
# a single comma-separated string (row 1 deliberately has extra spaces before
# the zip code; row 3 has an extra leading address component).
x <- data.frame(
  locationid = c(
    1073744023, 1073744022, 1073744025,
    1073744024, 1073744021, 1073744026
  ),
  address = c(
    "525 East 68th Street, New York, NY      10065, USA",
    "270 Park Avenue, New York, NY 10017, USA",
    "Rockefeller Center, 50 Rockefeller Plaza, New York, NY 10020, USA",
    "1251 Avenue of the Americas, New York, NY 10020, USA",
    "1301 Avenue of the Americas, New York, NY 10019, USA",
    "44 West 45th Street, New York, NY 10036, USA"
  )
)

library(rex)

# Field separator: a literal comma followed by whitespace (rex `spaces`
# shortcut -- presumably one or more whitespace characters; confirm in the
# rex shortcuts documentation).
sep <- rex(",", spaces)

# Build the address regex from named capture groups, in the order the fields
# appear in the string: address, city, state, zip, country.
re <-
  rex(
    # Street address: a run of characters other than a comma.
    capture(name = "address",
      except_some_of(",")
    ),
    sep,
    # City: again a run of non-comma characters.
    capture(name = "city",
      except_some_of(",")
    ),
    sep,
    # State: uppercase letters (e.g. "NY").
    capture(name = "state",
      uppers
    ),
    # State and zip are separated by whitespace only, not by a comma.
    spaces,
    # Zip: digits and hyphens, so "12345-1234" style codes are accepted too.
    capture(name = "zip",
      some_of(digit, "-")
    ),
    sep,
    # Country: whatever remains after the last separator.
    capture(name = "country",
      something
    ))

# Apply the regex to every address; the printed result below has one column
# per named capture group.
re_matches(x$address, re)
#>                      address     city state   zip country
#>1        525 East 68th Street New York    NY 10065     USA
#>2             270 Park Avenue New York    NY 10017     USA
#>3        50 Rockefeller Plaza New York    NY 10020     USA
#>4 1251 Avenue of the Americas New York    NY 10020     USA
#>5 1301 Avenue of the Americas New York    NY 10019     USA
#>6         44 West 45th Street New York    NY 10036     USA

This regex also handles 9-digit zip codes (12345-1234) and countries other than the USA.

0

Jim Nov 28 '14 at 14:34

source share

Ben Bolker · Accepted Answer · 2014-11-27T13:47:03+0000

 ss <- strsplit(data,",")`

n <- sapply(s,len)

( ).

mapply(ss,"[[",n)

.

sapply(ss,tail,1)

.

- ( ),

sapply(ss,function(x) tail(x,2)[1])

Extract comma-separated strings

More articles: