So far, I can successfully sort Spanish words with accented vowels by passing a UTF-8 locale to std::sort as the comparison object:
// [[Rcpp::export]] std::vector<std::string> sort_words(std::vector<std::string> x) { std::sort(x.begin(), x.end(), std::locale("en_US.UTF-8")); return x; } /*** R words <- c("casa", "árbol", "zona", "árbol", "casa", "libro") sort_words(words) */ returns (as expected): [1] "árbol" "árbol" "casa" "casa" "libro" "zona"
I cannot figure out how to do the same with the map:
// slightly modified version of tableC on http://adv-r.had.co.nz/Rcpp.html // [[Rcpp::export]] std::map<String, int> table_words(CharacterVector x) { std::setlocale(LC_ALL, "en_US.UTF-8"); // std::setlocale(LC_COLLATE, "en_US.UTF-8"); // also tried this instead of previous line std::map<String, int> counts; int n = x.size(); for (int i = 0; i < n; i++) { counts[x[i]]++; } return counts; } /*** R words <- c("casa", "árbol", "zona", "árbol", "casa", "libro") table_words(words) */ returns: casa libro zona árbol 2 1 1 2 but I want: árbol casa libro zona 2 2 1 1
Any ideas on how to make table_words put the accented "árbol" before "casa", either within Rcpp or back in R using base::sort?
Also, std::sort(..., std::locale("en_US.UTF-8")) only sorts the words correctly on my Linux machine with: gcc version 4.8.2 (Ubuntu 4.8.2-19ubuntu1). It does not work on Mac 10.10.3 with: Apple LLVM version 6.1.0 (clang-602.0.53) (based on LLVM 3.6.0svn). Any hints on what my Mac compiler is missing that my Linux compiler has?
Here is my script and my sessionInfo for both machines:
// [[Rcpp::plugins(cpp11)]] #include <locale> #include <clocale> #include <Rcpp.h> using namespace Rcpp; // [[Rcpp::export]] std::vector<std::string> sort_words(std::vector<std::string> x) { std::sort(x.begin(), x.end(), std::locale("en_US.UTF-8")); return x; } // [[Rcpp::export]] std::map<String, int> table_words(CharacterVector x) { // std::setlocale(LC_ALL, "en_US.UTF-8"); // tried this instead of next line std::setlocale(LC_COLLATE, "en_US.UTF-8"); std::map<String, int> counts; int n = x.size(); for (int i = 0; i < n; i++) { counts[x[i]]++; } return counts; } /*** R words <- c("casa", "árbol", "zona", "árbol", "casa", "libro") sort_words(words) table_words(words) sort(table_words(words), decreasing = T) output_from_Rcpp <- table_words(words) sort(names(output_from_Rcpp)) */ > words <- c("casa", "árbol", "zona", "árbol", "casa", "libro") > sort_words(words) [1] "árbol" "árbol" "casa" "casa" "libro" "zona" > table_words(words) casa libro zona árbol 2 1 1 2 > sort(table_words(words), decreasing = T) casa árbol libro zona 2 2 1 1 > output_from_Rcpp <- table_words(words) > sort(names(output_from_Rcpp)) [1] "árbol" "casa" "libro" "zona" sessionInfo on linux machine: R version 3.2.0 (2015-04-16) Platform: x86_64-pc-linux-gnu (64-bit) Running under: Ubuntu 14.04 LTS locale: [1] en_US.UTF-8 attached base packages: [1] stats graphics grDevices utils datasets methods base loaded via a namespace (and not attached): [1] tools_3.2.0 Rcpp_0.11.6 sessionInfo on Mac: R version 3.2.1 (2015-06-18) Platform: x86_64-apple-darwin13.4.0 (64-bit) Running under: OS X 10.10.3 (Yosemite) locale: [1] en_US.UTF-8 attached base packages: [1] stats graphics grDevices utils datasets methods base other attached packages: [1] textcat_1.0-3 readr_0.1.1 rvest_0.2.0 loaded via a namespace (and not attached): [1] httr_1.0.0 selectr_0.2-3 R6_2.1.0 magrittr_1.5 tools_3.2.1 curl_0.9.1 Rcpp_0.11.6 slam_0.1-32 stringi_0.5-5 [10] tau_0.0-18 stringr_1.0.0 XML_3.98-1.3