Links with whitespace before and after the URL are not processed correctly

I am crawling a site whose pages have whitespace (spaces and newlines) before and after the URL in the href attribute:

<a href=" /c/96894 ">Test</a> 

Instead of crawling this:

 http://www.store.com/c/96894 

the spider crawls this:

 http://www.store.com/c/%0A%0A/c/96894%0A%0A 

In addition, it causes an infinite loop on pages that link to themselves: each pass joins the malformed relative path onto the already-malformed URL, so it keeps growing:

 http://www.store.com/cp/%0A%0A/cp/96894%0A%0A/cp/96894%0A%0A 

Any whitespace ( \r , \n , \t and space) before and after a URL is ignored by all browsers. How do I strip the whitespace from the URLs the crawler extracts?
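For what it's worth, the malformed URL can be reproduced outside Scrapy with a plain urljoin, which is essentially the join step the link extractor performs before scheduling a request. This is a minimal sketch with a made-up page URL and href value matching the markup above; note that recent Python 3 releases strip tabs and newlines inside urlsplit, so the raw behavior only shows up on older interpreters like the Python 2 stack this Scrapy version ran on:

    # Hypothetical reproduction of the join step (not Scrapy itself).
    # On Python 2 / older Python 3 the newlines survive the join, and
    # Scrapy then percent-encodes them as %0A in the request URL.
    try:
        from urlparse import urljoin          # Python 2, matching the Scrapy version below
    except ImportError:
        from urllib.parse import urljoin      # Python 3

    href = "\n\n/cp/96894\n\n"                # href value with surrounding newlines
    print(urljoin("http://www.store.com/cp/", href))
    # -> http://www.store.com/cp/\n\n/cp/96894\n\n  (encoded as %0A%0A in the request)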

Here is my code.

    from scrapy.selector import HtmlXPathSelector
    from scrapy.contrib.spiders import CrawlSpider, Rule
    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
    from wallspider.items import Website

    class StoreSpider(CrawlSpider):
        name = "cpages"
        allowed_domains = ["www.store.com"]
        start_urls = ["http://www.store.com"]

        rules = (
            # Follow /c/ category links and scrape them.
            Rule(SgmlLinkExtractor(allow=('/c/',),
                                   deny=('grid=false', 'sort=', 'stores=', r'\|\|', 'page=')),
                 callback="parse_items", follow=True,
                 process_links=lambda links: [link for link in links if not link.nofollow]),
            # Follow everything else that is not denied, without scraping.
            Rule(SgmlLinkExtractor(allow=(),
                                   deny=('grid=false', 'sort=', 'stores=', r'\|\|', 'page='))),
        )

        def parse_items(self, response):
            hxs = HtmlXPathSelector(response)
            sites = hxs.select('//html')
            items = []
            for site in sites:
                item = Website()
                item['url'] = response.url
                item['referer'] = response.request.headers.get('Referer')
                item['anchor'] = response.meta.get('link_text')
                item['canonical'] = site.xpath('//head/link[@rel="canonical"]/@href').extract()
                item['robots'] = site.select('//meta[@name="robots"]/@content').extract()
                items.append(item)
            return items
2 answers

I used process_value=cleanurl in my SgmlLinkExtractor instances:

    def cleanurl(link_text):
        return link_text.strip("\t\r\n ")
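To illustrate why this works: process_value receives the raw attribute value before the request URL is built, so stripping there removes the whitespace before it can be joined and percent-encoded. A quick check with a hypothetical href value matching the question's markup:

    # Hypothetical href value from the question's markup:
    print(cleanurl("\n\n/c/96894\n\n"))   # -> /c/96894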

Here is the full code, in case someone else runs into the same problem:

    from scrapy.selector import HtmlXPathSelector
    from scrapy.contrib.spiders import CrawlSpider, Rule
    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
    from wallspider.items import Website

    def cleanurl(link_text):
        # Strip surrounding whitespace (and stray quotes) from the raw href value.
        return link_text.strip("\t\r\n '\"")

    class StoreSpider(CrawlSpider):
        name = "cppages"
        allowed_domains = ["www.store.com"]
        start_urls = ["http://www.store.com"]

        rules = (
            Rule(SgmlLinkExtractor(allow=('/cp/',),
                                   deny=('grid=false', 'sort=', 'stores=', r'\|\|', 'page='),
                                   process_value=cleanurl),
                 callback="parse_items", follow=True,
                 process_links=lambda links: [link for link in links if not link.nofollow]),
            Rule(SgmlLinkExtractor(allow=('/cp/', '/browse/'),
                                   deny=('grid=false', 'sort=', 'stores=', r'\|\|', 'page='),
                                   process_value=cleanurl)),
        )

        def parse_items(self, response):
            hxs = HtmlXPathSelector(response)
            sites = hxs.select('//html')
            items = []
            for site in sites:
                item = Website()
                item['url'] = response.url
                item['referer'] = response.request.headers.get('Referer')
                item['anchor'] = response.meta.get('link_text')
                item['canonical'] = site.xpath('//head/link[@rel="canonical"]/@href').extract()
                item['robots'] = site.select('//meta[@name="robots"]/@content').extract()
                items.append(item)
            return items

You can also replace the whitespace with '' when storing the item, like this:

    url = response.url
    item['url'] = url.replace(' ', '')

Or, to remove every whitespace character (not just spaces), use a regex:

    import re

    url = response.url
    item['url'] = re.sub(r'\s', '', url)
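A quick illustration of the difference (the URL value is made up): str.replace only touches literal spaces, while the regex also removes tabs, carriage returns and newlines:

    import re

    url = " http://www.store.com/c/96894 \n"
    print(url.replace(' ', ''))    # newline survives: 'http://www.store.com/c/96894\n'
    print(re.sub(r'\s', '', url))  # all whitespace removed: 'http://www.store.com/c/96894'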
