python使用多线程下载图片

哎,这几天好蛋疼,一直等着签合同,一直没通知,就这样整天呆在宿舍里无所事事的,什么都提不起兴趣….哎,赶紧正式开始上班吧,正式上班之后就有工资了。

1f7be3l

 

昨晚无聊写的,主要使用了多线程来实现,一共10000个页面,创建了100个线程,每个线程控制100个页面。
这货在我电脑上面跑的时候就是下面这样子
tupian
下面是代码

#author:liangliang
#email:liangliangyy@gmail.com
#blog:http://www.lylinux.org/
import urllib2
from bs4 import BeautifulSoup
import re
import os
import sys

import threading 

# Python 2-only idiom: setdefaultencoding() is not available on sys after
# interpreter start-up, so the module is reloaded to get it back, and the
# process-wide default codec for implicit str<->unicode conversion is then
# switched to UTF-8.
# NOTE(review): this affects every module in the process -- confirm it is
# still needed before touching it.
reload(sys)
sys.setdefaultencoding('utf-8')

# Listing URL for the tag (percent-encoded in the path); the page number is
# appended directly after "p=" by the code that builds each page URL.
baseurl = "http://www.topit.me/tag/%E7%BE%8E%E5%A5%B3?p="

# Make sure the download directory is present before anything is written to it.
if not os.path.exists('beauty'):
    os.mkdir('beauty')

def useragent(url):
    """Fetch ``url`` with browser-like request headers and return the body.

    Args:
        url: Absolute URL to request.

    Returns:
        The response body exactly as returned by ``read()``.

    Exceptions raised by ``urllib2.urlopen`` or ``read()`` are not caught
    here; they propagate to the caller.
    """
    # The header value is built from adjacent string literals. A backslash
    # continuation *inside* a literal would embed the source indentation of
    # the following line into the header value.
    i_headers = {
        "User-Agent": ("Mozilla/5.0 (Windows NT 6.1; WOW64) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) "
                       "Chrome/36.0.1985.125 Safari/537.36"),
        "Referer": 'http://baidu.com/',
    }

    req = urllib2.Request(url, headers=i_headers)
    response = urllib2.urlopen(req)
    try:
        return response.read()
    finally:
        # Close explicitly so the connection is released even if read() fails;
        # many threads call this function concurrently.
        response.close()

#img = myimg.find_all(['img'])
#link = img[0]['src']

def getpageimg(url):
    """Download every image linked from one item page into ``beauty/``.

    The page is fetched with ``useragent`` and parsed with BeautifulSoup. For
    each ``<div style="padding-top: 5px;">`` the ``href`` of its first ``<a>``
    is taken as the image URL, downloaded, and written to
    ``beauty/<last 11 characters of the URL>``. Files are opened in ``'wb'``
    mode, so two URLs sharing the same trailing 11 characters overwrite each
    other.

    The function is best-effort: failures are reported on stderr and never
    raised to the caller.

    Args:
        url: URL of the item page to scan.

    Returns:
        None.
    """
    try:
        html = useragent(url)
        soup = BeautifulSoup(html)
        containers = soup.findAll('div', style="padding-top: 5px;")
    except Exception as exc:
        # Nothing can be downloaded without the page itself; report and stop.
        sys.stderr.write('skip page %s: %s\n' % (url, exc))
        return

    for container in containers:
        # Each image is handled on its own so that one bad link or failed
        # download does not discard the remaining images of the page.
        try:
            finallimg = container.find_all(['a'])[0]['href']
            print(finallimg)

            response = urllib2.urlopen(finallimg)
            try:
                content2 = response.read()
            finally:
                response.close()

            target = os.path.join(u'beauty', finallimg[-11:])
            with open(target, 'wb') as code:
                code.write(content2)
        except Exception as exc:
            sys.stderr.write('skip image on %s: %s\n' % (url, exc))

def pageloop(url):
    """Visit one listing page and download the images of each item on it.

    Every element whose class is ``"e m"`` is rendered to text and searched
    for item-page URLs; the first URL found is handed to ``getpageimg``.
    Elements that contain no item URL are skipped.

    Args:
        url: URL of the listing page.

    Returns:
        None.

    Errors from fetching or parsing the listing page are not caught here.
    """
    # Compiled once per call rather than once per matched element.
    pattern = re.compile(r'http://www.topit.me/item/[0-9]*')

    html = useragent(url)
    soup = BeautifulSoup(html)
    for imgpage in soup.findAll(attrs={"class": "e m"}):
        pageurl = pattern.findall(str(imgpage))
        if not pageurl:
            # Indexing an empty result would raise IndexError and abort the
            # rest of this listing page.
            continue
        getpageimg(pageurl[0])

class getmyimg(threading.Thread):
    """Worker thread that crawls a contiguous range of listing pages.

    Pages ``begin`` up to, but not including, ``end`` are processed in order
    (the bounds are passed straight to ``range``).
    """

    def __init__(self, begin, end):
        """Store the page range; the thread is not started here.

        Args:
            begin: First page number to crawl (inclusive).
            end: Page number at which to stop (exclusive).
        """
        threading.Thread.__init__(self)
        self.begin = begin
        self.end = end

    def run(self):
        """Crawl each page of the range, reporting failures on stderr."""
        for i in range(self.begin, self.end):
            theurl = baseurl + str(i)
            print(theurl)
            try:
                pageloop(theurl)
            except Exception as exc:
                # An uncaught exception would end the thread and silently
                # skip every remaining page of this range.
                sys.stderr.write('page %s failed: %s\n' % (theurl, exc))

#for i in range(10263):
    #theurl = baseurl+str(i)
    #print theurl
    #pageloop(theurl)
#如果觉得cpu转得太快的话,把上面那段取消注释,
#然后把下面的注释掉
def split_pages(first, stop, step):
    """Split the half-open page range ``[first, stop)`` into chunks.

    Args:
        first: First page number (inclusive).
        stop: Page number at which to stop (exclusive).
        step: Maximum number of pages per chunk.

    Returns:
        A list of ``(begin, end)`` tuples, each half-open, that together
        cover every page from ``first`` to ``stop - 1`` exactly once. The
        last chunk is shortened so that it never runs past ``stop``.
    """
    return [(begin, min(begin + step, stop))
            for begin in range(first, stop, step)]


if __name__ == '__main__':
    # One worker thread per chunk of 100 pages. The chunks are half-open
    # because the worker iterates with range(begin, end); closed pairs such
    # as (1, 100), (101, 200) would leave pages 100, 200, ... unvisited.
    threads = [getmyimg(begin, end)
               for begin, end in split_pages(1, 10263, 100)]
    for t in threads:
        t.start()
    # Wait for all worker threads to finish.
    for t in threads:
        t.join()

print("the end!!")

发表评论

电子邮件地址不会被公开。 必填项已用*标注