边学边做-Python网站爬虫(一)
结合以前的Java代码,用Python写了一个类似的爬虫,为了更好的练习,因此未采用诸如Beautiful Soup之类的强大工具库,代码比较挫:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 |
# encoding=utf8
"""Minimal single-page link crawler that avoids third-party HTML parsers.

The script downloads one page, pulls every double-quoted ``href`` value out
of the raw HTML with a regular expression, drops links to non-page
resources, resolves the remaining links against the start URL and prints
them one per line.
"""
import contextlib
import re

try:  # Python 2 module names
    import urllib2
    from urlparse import urljoin, urlparse
except ImportError:  # Python 3: the same API lives in urllib.request / urllib.parse
    import urllib.request as urllib2
    from urllib.parse import urljoin, urlparse

# Captures the value of every double-quoted href attribute.
_HREF_RE = re.compile(r'href="(.*?)"')

# Path suffixes of resources that are not worth crawling as pages.
_SKIPPED_EXTENSIONS = ('.pdf', '.rar', '.zip', '.jpg', '.js', '.css')

# Browser-like request headers sent with the page request.
_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Accept': 'text/html;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN',
    'Accept-Charset': 'gb2312',
}


def _to_text(raw, charset):
    """Return the response body as a native string for regex matching."""
    if isinstance(raw, str):
        # Python 2: the body is already a native (byte) string.
        return raw
    try:
        return raw.decode(charset or 'utf-8', 'replace')
    except LookupError:
        # The server announced a charset name Python does not know.
        return raw.decode('utf-8', 'replace')


class sitecrawl(object):
    """Collect the links found on a single page.

    Args:
        url: Absolute URL of the page to fetch; relative links found on the
            page are resolved against it.
    """

    def __init__(self, url):
        self.url = url

    def extract_links(self, html):
        """Return the normalised page links contained in ``html``.

        Links containing ``@`` (e-mail addresses) and links whose path ends
        in one of ``_SKIPPED_EXTENSIONS`` are dropped. Every other link is
        resolved against ``self.url`` and stripped of trailing ``#`` and
        ``/`` characters so that equivalent URLs compare equal.

        Args:
            html: Page source as a string.

        Returns:
            A list of absolute URLs in document order (duplicates kept).
        """
        links = []
        for target in _HREF_RE.findall(html):
            if '@' in target:
                continue
            try:
                # Test the extension on the path only, so that ".jsp" pages
                # or a host name containing ".js" are not discarded.
                if urlparse(target).path.lower().endswith(_SKIPPED_EXTENSIONS):
                    continue
                # urljoin copes with https, protocol-relative and relative
                # links and never produces a doubled slash.
                link = urljoin(self.url, target)
            except ValueError:
                # Malformed URL (e.g. a broken IPv6 literal): skip it.
                continue
            links.append(link.rstrip('#/'))
        return links

    def crawl(self):
        """Fetch ``self.url``, print every link found and return them.

        Returns:
            The list of links that was printed.

        Raises:
            urllib2.URLError: If the page cannot be fetched.
        """
        request = urllib2.Request(self.url, headers=_REQUEST_HEADERS)
        # closing() guarantees the connection is released on both Python 2
        # and Python 3, even if read() fails.
        with contextlib.closing(urllib2.urlopen(request)) as response:
            raw = response.read()
            charset = None
            if not isinstance(raw, str):
                # Python 3 only: ask the response headers for the charset.
                charset = response.headers.get_content_charset()
        links = self.extract_links(_to_text(raw, charset))
        for link in links:
            print(link)
        return links


if __name__ == '__main__':
    go = sitecrawl('http://www.37.com/')
    go.crawl()
接下来考虑加入去重,多线程先缓一缓吧