I recently moved my cURL-based scraper code to CodeIgniter. I am using the CodeIgniter Curl library from http://philsturgeon.co.uk/code/codeigniter-curl. I put the cleanup process in the controller, and then found that the scraper runs more slowly than the version I built in plain PHP.
CodeIgniter takes 12 seconds to output the result, while plain PHP takes only 6 seconds. Both timings include parsing with the HTML DOM parser.
Here is my Curl code in CodeIgniter:
/**
 * Request a URL through the CodeIgniter Curl library held in $this->curl.
 *
 * The request is sent as a POST when $postdata is truthy; otherwise only the
 * option set below is applied before executing.
 *
 * @param string $url      Address to request.
 * @param mixed  $postdata Data handed to the library's post() call, or false for no POST.
 * @return mixed Whatever $this->curl->execute() returns.
 */
function curl($url, $postdata=false)
{
    // The same file serves as both cookie jar (written) and cookie file (read).
    $cookiePath = dirname(__FILE__) . "/cookie.txt";
    $userAgent = "Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.4) Gecko/20030624 Netscape/7.1 (ax)";

    $this->curl->create($url);
    $this->curl->ssl(false);

    $settings = array(
        'URL' => $url,
        'HEADER' => 0,
        'AUTOREFERER' => true,
        'FOLLOWLOCATION' => true,
        'TIMEOUT' => 60,
        'RETURNTRANSFER' => 1,
        'USERAGENT' => $userAgent,
        'COOKIEJAR' => $cookiePath,
        'COOKIEFILE' => $cookiePath,
    );

    if (!$postdata) {
        // Plain request: just apply the option set.
        $this->curl->options($settings);
    } else {
        // POST request: the library receives the payload together with the options.
        $this->curl->post($postdata, $settings);
    }

    return $this->curl->execute();
}
Non-CodeIgniter version (plain PHP):
/**
 * Request a URL with the PHP cURL extension and return the result of curl_exec().
 *
 * @param string     $url    Address to request; it is also sent as the Referer header.
 * @param bool       $binary When truthy, CURLOPT_BINARYTRANSFER is set.
 * @param array|bool $post   Map of field name => value to send as the POST body, or false for no POST.
 * @param bool       $cookie When truthy, a browser user agent is set and cookie.txt next to this
 *                           file is used as both cookie jar and cookie file.
 * @return mixed The value returned by curl_exec(); CURLOPT_RETURNTRANSFER is enabled.
 */
function curl($url, $binary = false, $post = false, $cookie = false)
{
    $ch = curl_init();

    // Peer certificate verification is switched off, so any CA is accepted.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2);
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_REFERER, $url);
    curl_setopt($ch, CURLOPT_ENCODING, 'gzip,deflate');
    curl_setopt($ch, CURLOPT_AUTOREFERER, true);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_TIMEOUT, 60);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);

    if ($cookie) {
        $agent = "Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.4) Gecko/20030624 Netscape/7.1 (ax)";
        $cookiePath = dirname(__FILE__) . "/cookie.txt";
        curl_setopt($ch, CURLOPT_USERAGENT, $agent);
        curl_setopt($ch, CURLOPT_COOKIEJAR, $cookiePath);
        curl_setopt($ch, CURLOPT_COOKIEFILE, $cookiePath);
    }

    if ($binary) {
        curl_setopt($ch, CURLOPT_BINARYTRANSFER, 1);
    }

    if ($post) {
        // Build "key=value&key=value" from the field map. Collecting the pairs and
        // joining them avoids appending to an undefined variable and avoids
        // trimming a trailing "&" that belongs to the last value.
        // NOTE(review): keys and values are sent as-is (not URL-encoded), matching
        // the previous behaviour; callers must pre-encode special characters.
        $pairs = array();
        foreach ($post as $key => $value) {
            $pairs[] = $key . '=' . $value;
        }

        curl_setopt($ch, CURLOPT_POST, true);
        curl_setopt($ch, CURLOPT_POSTFIELDS, implode('&', $pairs));
    }

    return curl_exec($ch);
}
Does anyone know why the CodeIgniter Curl version is slower? Or could it be because the parser is simple_html_dom?