I am working on some text conversion procedures that parse time values in different formats in Ruby. These procedures are growing in complexity, and I'm currently testing which approach is best for this problem.
I am currently testing scanf for this use case. Why? I always thought it was faster than a regex, but what happened in Ruby? It was a lot slower!
What am I doing wrong?
Note. I am using ruby-1.9.2-p290 [x86_64] (MRI)
First Ruby Test:
require "scanf"
require 'benchmark'
# Converts an "HH:MM:SS" duration string (two or more hour digits) into a
# number of seconds.
#
# Returns the total as an Integer, or nil when the string is not in that
# format.
#
# The pattern is anchored with \A and \Z rather than ^ and $. In Ruby, ^ and $
# match at every line boundary, so a multi-line string such as "5\n00:10:30"
# would pass validation and then be split into the wrong fields. \Z still
# tolerates a single trailing newline, as $ did.
def duration_in_seconds_regex(duration)
  return nil unless duration =~ /\A(\d{2,}):(\d{2}):(\d{2})\Z/
  # Use the validated captures directly instead of re-splitting the input.
  h, m, s = $~.captures.map { |n| n.to_i }
  h * 3600 + m * 60 + s
end
def duration_in_seconds_scanf(duration)
a = duration.scanf("%d:%d:%d")
a[0] * 3600 + a[1] * 60 + a[2]
end
# Number of conversions timed for each implementation.
n = 500000

# First report: the scanf-based conversion.
Benchmark.bm do |bench|
  bench.report do
    n.times { duration_in_seconds_scanf("00:10:30") }
  end
end

# Second report: the regex-based conversion.
Benchmark.bm do |bench|
  bench.report do
    n.times { duration_in_seconds_regex("00:10:30") }
  end
end
This is what I got, with the scanf version first and the regex version second:
user system total real
95.020000 0.280000 95.300000 ( 96.364077)
user system total real
2.820000 0.000000 2.820000 ( 2.835170)
Second test using C:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/types.h>
#include <string.h>
#include <regex.h>
/*
 * Search `string` for the first match of the POSIX extended regular
 * expression `patrn`.
 *
 * On a match, *begin and *end receive the byte offsets of the match (start
 * inclusive, end exclusive) and a newly malloc'd, NUL-terminated copy of the
 * matched text is returned; the caller must free() it.
 *
 * Returns NULL, leaving *begin and *end untouched, when the pattern does not
 * compile or nothing matches. NULL is also returned when the copy cannot be
 * allocated; in that case *begin and *end have already been written.
 */
char *regexp(char *string, char *patrn, int *begin, int *end) {
    char *word = NULL;
    regex_t rgT;
    regmatch_t match;

    /*
     * A regex_t is only usable after a successful regcomp(); running
     * regexec()/regfree() on it after a failed compile is undefined.
     */
    if (regcomp(&rgT, patrn, REG_EXTENDED) != 0) {
        return NULL;
    }

    if (regexec(&rgT, string, 1, &match, 0) == 0) {
        size_t len = (size_t) (match.rm_eo - match.rm_so);

        *begin = (int) match.rm_so;
        *end = (int) match.rm_eo;

        word = malloc(len + 1);
        if (word != NULL) {
            memcpy(word, string + match.rm_so, len);
            word[len] = '\0';
        }
    }

    regfree(&rgT);
    return word;
}
/*
 * Benchmark driver: converts the fixed string "00:01:30" to seconds 500000
 * times with sscanf(), then 500000 times with regexp() validation followed
 * by strtok() splitting, and prints the CPU time of each loop as measured
 * by clock().
 */
int main(int argc, char** argv) {
    enum { ITERATIONS = 500000 };
    char str[] = "00:01:30";
    const char delims[] = ":";
    int h, m, s;
    int i, b, e;
    /* volatile sink so the computed totals are not discarded as dead code */
    volatile long total = 0;
    clock_t start_time, end_time;
    double time_elapsed;

    (void) argc;
    (void) argv;

    /* Subtract the raw clock_t ticks first; converting each reading to
     * float before subtracting throws away precision. */
    start_time = clock();
    for (i = 0; i < ITERATIONS; i++) {
        if (sscanf(str, "%d:%d:%d", &h, &m, &s) == 3) {
            total = h * 3600L + m * 60L + s;
        }
    }
    end_time = clock();
    time_elapsed = (double) (end_time - start_time) / CLOCKS_PER_SEC;
    printf("sscanf_time (500k iterations): %.4f", time_elapsed);

    start_time = clock();
    for (i = 0; i < ITERATIONS; i++) {
        char *match = regexp(str, "[0-9]{2,}:[0-9]{2}:[0-9]{2}", &b, &e);

        /* regexp() returns NULL when nothing matched; strcmp(NULL, ...) is
         * undefined, so test the pointer first. */
        if (match != NULL && strcmp(match, str) == 0) {
            /* strtok() writes into its argument, so tokenise a copy sized
             * for the whole string including its terminator. */
            char copy[sizeof str];
            char *hs, *ms, *ss;

            memcpy(copy, str, sizeof str);
            hs = strtok(copy, delims);
            ms = strtok(NULL, delims);
            ss = strtok(NULL, delims);
            if (hs != NULL && ms != NULL && ss != NULL) {
                /* strtok() yields text; convert it to numbers explicitly. */
                total = strtol(hs, NULL, 10) * 3600L
                      + strtol(ms, NULL, 10) * 60L
                      + strtol(ss, NULL, 10);
            }
        }
        free(match);
    }
    end_time = clock();
    time_elapsed = (double) (end_time - start_time) / CLOCKS_PER_SEC;
    printf("\n\nregex_time (500k iterations): %.4f", time_elapsed);

    return (EXIT_SUCCESS);
}
The C code is obviously faster, and, as expected, the regex version is slower than the sscanf version:
sscanf_time (500k iterations): 0.1774
regex_time (500k iterations): 3.9692
, C , , , , Ruby - .
gist.