Scraping all files on a web page with a Python crawler

Today AI asked me to download all the slides for a course, so I wrote a script that crawls every link on the page.

Single-threaded crawler

# coding:utf-8
import re
import requests

# Fetch the page
r = requests.get('http://www.cs.cmu.edu/afs/cs/academic/class/15745-s06/web/schedule.html')
data = r.text

# Find every href value with a regular expression
link_list = re.findall(r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')", data)

# Download each link and save it under its file name
for url in link_list:
    print url
    r = requests.get(url)
    # print url.split('/')[-1]
    with open(url.split('/')[-1], "wb") as f:
        f.write(r.content)
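
One caveat, in case the script chokes: the regex returns href values exactly as they appear in the HTML, so relative links (or mailto:/#fragment hrefs) will make requests.get fail. Below is a minimal sketch of how to clean the list first, assuming you only want the slide files; base_url, is_slide and slide_links are my own names, not part of the original script.

import urlparse

base_url = 'http://www.cs.cmu.edu/afs/cs/academic/class/15745-s06/web/schedule.html'

def is_slide(url):
    # hypothetical filter: keep typical slide formats only
    return url.lower().endswith(('.pdf', '.ppt', '.pptx'))

# resolve relative hrefs against the page URL, then keep only the slides
full_links = [urlparse.urljoin(base_url, u) for u in link_list]
slide_links = [u for u in full_links if is_slide(u)]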

Multi-threaded crawler

The idea: first crawl the page to collect all the links, then put them into a queue and let worker threads download them.

# coding:utf-8
import re
import requests
import threading, Queue

lock = threading.Lock()


def get_queue(links):
    # put every link into a shared queue for the worker threads
    link_queue = Queue.Queue()
    for p in links:
        link_queue.put(p)
    return link_queue


class GetPdf(threading.Thread):
    def __init__(self):
        threading.Thread.__init__(self)

    def get_pdf(self, link):
        global lock
        print link
        r = requests.get(link)
        try:
            # serialize the file writes; the with block releases the lock even on error
            with lock:
                # print link.split('/')[-1]
                with open(link.split('/')[-1], "wb") as f:
                    f.write(r.content)
        except IOError:
            print "Error: failed to write file"


class GetPdfMulti(GetPdf):
    def __init__(self, link_queue):
        GetPdf.__init__(self)
        self.link_queue = link_queue

    def run(self):
        # drain the shared queue; get_nowait() avoids blocking forever when
        # another thread takes the last item between empty() and get()
        while True:
            try:
                link = self.link_queue.get_nowait()
            except Queue.Empty:
                break
            self.get_pdf(link)


if '__main__' == __name__:
    # Fetch the page
    r = requests.get('http://www.cs.cmu.edu/afs/cs/academic/class/15745-s06/web/schedule.html')
    data = r.text
    thread_num = 20
    # Find every href value with a regular expression
    link_list = re.findall(r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')", data)
    link_queue = get_queue(link_list)
    ThreadList = []
    for i in range(0, thread_num):
        t = GetPdfMulti(link_queue)
        ThreadList.append(t)
    for t in ThreadList:
        t.start()
    for t in ThreadList:
        t.join()
    print "done"