def get_pdf(self, link):
    """Download ``link`` and save it into the current working directory.

    The local file name is the last ``/``-separated segment of ``link``.
    The file is written while holding the module-level ``lock``.

    An ``IOError`` raised while opening or writing the file is reported on
    stdout and not propagated. Errors raised by ``requests.get`` are not
    handled here.
    """
    print(link)
    r = requests.get(link)
    try:
        # Hold the lock through a context manager so it is released even
        # when open()/write() raises. The earlier acquire()/release() pair
        # skipped release() on IOError, which left the lock held and blocked
        # every other worker for good.
        # NOTE(review): assumes ``lock`` is a threading lock (supports
        # ``with``); it is defined outside this block.
        with lock:
            with open(link.split('/')[-1], "wb") as f:
                f.write(r.content)
    except IOError:
        print("Error: 读取文件失败")
def run(self):
    """Take links from ``self.link_queue`` and download each one.

    Every item taken from the queue is handed to ``self.get_pdf``. The
    method returns as soon as the queue has no item immediately available.
    """
    # Local import keeps the block self-contained on both Python 3 (queue)
    # and Python 2 (Queue).
    # NOTE(review): assumes ``self.link_queue`` is a stdlib-style queue whose
    # ``get_nowait()`` raises ``Empty`` — confirm against ``get_queue``.
    try:
        from queue import Empty
    except ImportError:
        from Queue import Empty

    while True:
        # A separate empty() check followed by a blocking get() is racy when
        # several workers share the queue: another worker can take the last
        # item in between, and this one would then block forever. Fetching
        # without blocking makes the check and the fetch a single step.
        try:
            link = self.link_queue.get_nowait()
        except Empty:
            break
        self.get_pdf(link)
if '__main__' == __name__:
    # Script entry point: fetch the schedule page, collect every href on it,
    # and hand the links to a pool of worker threads.
    try:
        from urllib.parse import urljoin  # Python 3
    except ImportError:
        from urlparse import urljoin  # Python 2

    page_url = 'http://www.cs.cmu.edu/afs/cs/academic/class/15745-s06/web/schedule.html'
    thread_num = 20

    # Fetch the page content.
    r = requests.get(page_url)
    data = r.text

    # Use a regular expression to find every href value, whether it is
    # double-quoted or single-quoted.
    raw_links = re.findall(r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')", data)

    # Resolve each href against the page URL: relative links are not
    # fetchable on their own, while absolute URLs pass through unchanged.
    link_list = [urljoin(page_url, href) for href in raw_links]

    # ``link_queue`` stays a module-level name on purpose; worker code may
    # look it up as a global.
    link_queue = get_queue(link_list)

    # Create all workers first, start them, then wait for every one.
    thread_list = [GetPdfMulti(link_queue) for _ in range(thread_num)]
    for t in thread_list:
        t.start()
    for t in thread_list:
        t.join()
    print("done")