python 爬虫
为什么会想到用 Python 去爬取 QQ 空间相册,而且还要区分照片和视频呢?就是因为相册里的内容太多了。回想当时,真是想哭。还好,我是一个技术宅,还是个混在通信圈里的技术宅,于是查资料,琢磨出了这个大篇幅的脚本。今天代码出了幺蛾子,修正之后又可以执行了,现在分享给有需要的人,希望有人能关注,一块交流交流。
先简单说下,技术要点吧。
原理就是利用 Python + Selenium + chromedriver 模拟手机浏览器登录,获取相应的 Cookie,属于爬虫的一些简单应用。下面是简单教程(这里假定你的电脑已装了 Python 3.6+):
1)直接使用pip安装
pip3 install selenium
2)下载 chromedriver 和 Chrome 浏览器。这里要注意的是,chromedriver 和 Chrome 浏览器的版本是一一对应的,也就是说,不同版本的 Chrome 浏览器需要使用对应版本的 chromedriver。下面是当前最新的对应关系,直接用就可以了。
ChromeDriver v74.0.3729.6 (2019-03-14)----------Supports Chrome v74
下载地址:
http://npm.taobao.org/mirrors/chromedriver/
这两样都下载好之后,把 chromedriver 放在相应的浏览器安装目录下。
好了,做好这两步,准备工作就完成了,下面直接分享代码:
#!coding:utf-8
# ver: v2.1
"""Download the photos and videos of a QQ Zone (Qzone) album.

Flow:

1. Drive a Chrome instance (mobile emulation, via Selenium + chromedriver)
   through the Qzone mobile login page.
2. Read the ``qzonetoken`` exposed by the page and the session cookies,
   then close the browser.
3. Derive ``g_tk`` from the ``p_skey`` cookie and replay the cookies in a
   ``requests`` session to list the album and page through its items.
4. Save every item under ``path`` as ``<shoot time>.mp4`` (videos) or
   ``<shoot time>.jpg`` (photos), skipping files that already exist.
"""
import json
import logging
import os
import time
import urllib
import urllib.request  # explicit: a bare ``import urllib`` does not load the submodule

import requests
import urllib3
from selenium import webdriver
from selenium.webdriver.common.by import By

# Write the log to example.log; filemode "w" truncates it on every run.
logging.basicConfig(filename='example.log', filemode="w", level=logging.DEBUG)
logging.debug('This message should go to the log file')
logging.info('So should this')
logging.warning('And this, too')

# ---------------------------------------------------------------------------
# Settings
# ---------------------------------------------------------------------------
dlpsnum = 500  # first "ps" paging offset requested; getPic walks down to 0 in steps of 20
path = r'E:\NONO'  # directory the downloaded files are saved into

# Login information.
login_uin = 'XXXXXXXX'  # QQ number used to log in
pwd = 'XXXXXXXX'  # login password
# To avoid keeping the password in this file, read it at run time instead,
# e.g. ``pwd = input(...)``.
album_uin = 'XXXXXX'  # QQ number whose album is read

# Assumes Chrome is installed in the default location and chromedriver.exe
# was copied next to it.
CHROMEDRIVER_PATH = r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"

# Seconds to wait for the page to expose the qzonetoken after submitting
# the login form. Generous so that a manual verification step still fits.
QZONETOKEN_TIMEOUT = 300

USER_AGENT = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36')

# urlretrieve() uses the globally installed opener, so install the
# User-Agent header once instead of before every single download.
_opener = urllib.request.build_opener()
_opener.addheaders = [('User-Agent', USER_AGENT)]
urllib.request.install_opener(_opener)


def _wait_for_qzonetoken(driver, timeout):
    """Poll the page until ``window.shine0callback`` is truthy and return it.

    Raises:
        TimeoutError: if the value is still falsy after ``timeout`` seconds
            (for example because the login was rejected).
    """
    deadline = time.monotonic() + timeout
    while True:
        token = driver.execute_script("return window.shine0callback")
        if token:
            return token
        if time.monotonic() >= deadline:
            raise TimeoutError(
                'qzonetoken was not available after %s seconds; login probably failed'
                % timeout)
        time.sleep(0.1)


def _compute_g_tk(skey):
    """Return the ``g_tk`` request parameter derived from the ``p_skey`` cookie.

    Starts at 5381, folds every character in with ``h += (h << 5) + ord(ch)``
    and keeps the low 31 bits of the result, as a decimal string.
    """
    h = 5381
    for ch in skey:
        h += (h << 5) + ord(ch)
    return str(h & 0x7FFFFFFF)


def _download(url, dest_path):
    """Fetch ``url`` into ``dest_path`` without ever leaving a partial file there.

    The data is written to ``dest_path + '.part'`` and renamed only after the
    transfer finished. getPic skips files whose final name already exists, so
    a truncated file under the final name would never be repaired.
    """
    tmp_path = dest_path + '.part'
    try:
        urllib.request.urlretrieve(url, tmp_path)
        os.replace(tmp_path, dest_path)
    finally:
        # Only still present when the download or the rename failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


def getPic(psid):
    """Download every album item, paging from ``psid`` down to 0 in steps of 20.

    Reads the module-level ``s`` (logged-in session), ``qzonetoken``,
    ``g_tk``, ``album_uin`` and ``path``.

    Each item is named after its ``shoottime`` formatted in local time with
    one-second resolution. Items with a non-blank video URL are stored as
    ``.mp4``, everything else as ``.jpg``. Files that already exist are
    skipped, so an interrupted run can simply be started again.

    NOTE: two items shot within the same second map to the same file name,
    so only the first one is stored.

    Args:
        psid: first value of the ``ps`` paging parameter to request.
    """
    os.makedirs(path, exist_ok=True)

    while psid >= 0:
        print(psid)
        # Photo list of the album for the current paging offset.
        # NOTE(review): the "&album&ps=" fragment is kept exactly as published;
        # it may originally have been an "albumid=<id>" parameter - confirm.
        url = ("https://h5.qzone.qq.com/webapp/json/mqzone_photo/getPhotoList2?qzonetoken="
               + qzonetoken + "&g_tk=" + g_tk + "&uin=" + album_uin
               + "&album&ps=" + str(psid))
        print("qzonetoken :" + qzonetoken)
        r = s.get(url)
        photo_datas = json.loads(r.text)

        for group in photo_datas['data']['photos'].values():
            for pic in group:
                # ':' is not allowed in Windows file names, hence the underscores.
                dlpicname = time.strftime('%Y-%m-%d %H_%M_%S',
                                          time.localtime(pic['shoottime']))

                # Items without video data (or with a blank URL) are photos.
                video_url = (pic.get('videodata') or {}).get('videourl') or ''
                if video_url.strip():
                    label, extension, dlurl = '视频名', '.mp4', video_url
                else:
                    label, extension, dlurl = '图片名', '.jpg', pic['1']['url']

                dest_dir = os.path.join(path, dlpicname + extension)
                # Skip anything that was already downloaded.
                if os.path.exists(dest_dir):
                    continue

                print(label + dlpicname + ', url:' + dlurl)
                _download(dlurl, dest_dir)

        psid = psid - 20
    print("=" * 10)


# ---------------------------------------------------------------------------
# Log in with a mobile-emulated browser
# ---------------------------------------------------------------------------
s = requests.Session()

# Emulated phone model.
mobileEmulation = {"deviceName": "Nexus 5"}
options = webdriver.ChromeOptions()
options.add_experimental_option('mobileEmulation', mobileEmulation)

# NOTE: newer Selenium 4 releases no longer accept ``executable_path``;
# there, pass ``service=Service(CHROMEDRIVER_PATH)`` instead.
driver = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH, options=options)
try:
    driver.get('https://mobile.qzone.qq.com')
    for element_id, text in (('u', login_uin), ('p', pwd)):
        field = driver.find_element(By.ID, element_id)
        field.clear()
        field.send_keys(text)
    driver.find_element(By.ID, 'go').click()

    # Wait for the page's JavaScript to compute the qzonetoken.
    qzonetoken = _wait_for_qzonetoken(driver, QZONETOKEN_TIMEOUT)

    # The cookies must be read before the browser is closed.
    cookies = driver.get_cookies()
finally:
    # Always close the browser, even when the login fails.
    driver.quit()

cookies_ = {cookie['name']: cookie['value'] for cookie in cookies}
try:
    skey = cookies_['p_skey']
except KeyError:
    raise RuntimeError(
        "no 'p_skey' cookie after login; cannot compute g_tk") from None

g_tk = _compute_g_tk(skey)

# ---------------------------------------------------------------------------
# Replay the cookies and read the album list
# ---------------------------------------------------------------------------
requests.utils.add_dict_to_cookiejar(s.cookies, cookies_)
url = ("https://mobile.qzone.qq.com/list?qzonetoken=" + qzonetoken
       + "&g_tk=" + g_tk
       + "&format=json&list_type=album&action=0&res_uin=" + album_uin
       + "&count=1")
r = s.get(url)
data = json.loads(r.text)

for album in data['data']['vFeeds']:
    print('相册名:' + album['pic']['albumname'])
    print('相册id:' + album['pic']['albumid'])
    print('图片数量:' + str(album['pic']['albumnum']))
    print('开始下载相册图片:')
    getPic(dlpsnum)
QQ空间相册
通过以上的努力,终于可以把 QQ 空间里的几千张宝宝照片下载下来了,相当高兴,不是吗?
#备注:以上获取 QQ 空间 Key(qzonetoken、g_tk)等的方法,来源于互联网。