当前位置:首页 > 新闻中心 >

python google play

作者:曲靖市宁江木业有限公司 来源:www.ynnjmy.com 发布时间:2017-09-07 13:11:11

python google play


#!/usr/bin/env python

#-*- coding: utf-8 -*-

import urllib

import urllib2

import random

import requests

import os,sys

import MySQLdb

from sgmllib import SGMLParser

from BeautifulSoup import BeautifulSoup

import re

# Sequence number used as the file name of the next downloaded cover image.
# Kept at module level so numbering continues across every page crawled.
num = 0

# Seconds to wait for a single HTTP response before giving up on that request.
_HTTP_TIMEOUT = 30

# Directory (relative to the working directory) that receives cover images.
_COVER_DIR = "googlemarket"

_PLAY_ROOT = "https://play.google.com"

# Category identifiers appended to the Play Store category listing URL.
_CATEGORIES = (
    'PERSONALIZATION', 'TRANSPORTATION', 'SPORTS', 'HEALTH_AND_FITNESS',
    'APP_WALLPAPER', 'COMICS', 'MEDICAL', 'BUSINESS', 'BOOKS_AND_REFERENCE',
    'WEATHER', 'ENTERTAINMENT', 'MEDIA_AND_VIDEO', 'APP_WIDGETS', 'TOOLS',
    'PHOTOGRAPHY', 'PRODUCTIVITY', 'EDUCATION', 'NEWS_AND_MAGAZINES',
    'TRAVEL_AND_LOCAL', 'LIFESTYLE', 'SOCIAL', 'FINANCE', 'SHOPPING',
    'LIBRARIES_AND_DEMO', 'COMMUNICATION', 'MUSIC_AND_AUDIO', 'GAME',
)

# Patterns are applied to the prettified page text; compiled once because
# every one of them is reused for each page.
_APP_LINK_RE = re.compile(r'<a class="title" href="(.+?)" title')
_TITLE_RE = re.compile(
    r'<div class="document-title" itemprop="name">[\s\S]*?<div>([\s\S]*?)</div>')
_DEVELOPER_RE = re.compile(r'itemprop="name">([\s\S]*?)</a>')
_VERSION_RE = re.compile(r'itemprop="softwareVersion">([\s\S]*?)</div>')
_PUBLISHED_RE = re.compile(r'itemprop="datePublished">([\s\S]*?)</div>')
_FILE_SIZE_RE = re.compile(r'itemprop="fileSize">([\s\S]*?)</div>')
_OS_RE = re.compile(r'itemprop="operatingSystems">([\s\S]*?)</div>')
_DESCRIPTION_RE = re.compile(
    r'itemprop="description">[\s\S]*?<div>([\s\S]*?)</div>')
_COVER_RE = re.compile(
    r'<img class="cover-image" src=(.+?) alt="Cover art" itemprop="image" />')

# Matches the <br>, <br/>, <br />, <p> and </p> tags removed from descriptions.
_MARKUP_RE = re.compile(r'<br\s*/?>|</?p>')

# Index of the line, inside the first _DEVELOPER_RE match, that is printed as
# the developer name.
# NOTE(review): this relies on how prettify() lays the markup out -- confirm
# against a current page before trusting the value.
_DEVELOPER_LINE = 8


def _fetch_pretty(url):
    """Return the page at *url* as prettified UTF-8 text, or None on failure.

    A failed request is printed and reported as None so that one unreachable
    page does not abort the whole crawl.
    """
    try:
        response = requests.get(url, timeout=_HTTP_TIMEOUT)
    except requests.exceptions.RequestException as e:
        print(e)
        return None
    return BeautifulSoup(response.content).prettify("utf-8")


def _print_first(pattern, page):
    """Print the first match of *pattern* in *page*; print nothing if none."""
    found = pattern.findall(page)
    if found:
        print(found[0])


def _save_cover(src):
    """Download the image at *src* into _COVER_DIR under the next number."""
    global num
    headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201'}
    try:
        response = requests.get(src, headers=headers, timeout=_HTTP_TIMEOUT)
    except requests.exceptions.RequestException as e:
        print(e)
        return
    if not os.path.isdir(_COVER_DIR):
        os.makedirs(_COVER_DIR)
    # Binary mode: the payload is image bytes, not text.
    with open(os.path.join(_COVER_DIR, str(num)), "wb") as f:
        f.write(response.content)
    num = num + 1
    print(num)


def _crawl_app(url):
    """Print the fields scraped from one app page and save its cover images."""
    print(url)
    page = _fetch_pretty(url)
    if page is None:
        return

    # Name
    for title in _TITLE_RE.findall(page):
        print(title)

    # Developer
    developer_blocks = _DEVELOPER_RE.findall(page)
    if developer_blocks:
        lines = developer_blocks[0].split("\n")
        if len(lines) > _DEVELOPER_LINE:
            print(lines[_DEVELOPER_LINE])

    # Version, update time, file size, supported OS versions.  Any of these
    # can be absent from a page, so each one is printed only when found.
    for pattern in (_VERSION_RE, _PUBLISHED_RE, _FILE_SIZE_RE, _OS_RE):
        _print_first(pattern, page)

    # Description, with line-break and paragraph tags replaced by spaces.
    for description in _DESCRIPTION_RE.findall(page):
        print(_MARKUP_RE.sub(' ', description))

    # NOTE: the original script carried a disabled
    #   insert into address(name,version,developer,pubtime,filesize,support,
    #   introduction)
    # whose value tuple listed the developer before the version, which does
    # not match that column order.  It stays disabled here; fix the ordering
    # before turning it back on.

    # Cover art.  The capture includes the quotes around the src attribute,
    # so only those are stripped; slicing with [1:-2] also removed the last
    # character of the URL.
    for quoted_src in _COVER_RE.findall(page):
        print(quoted_src)
        _save_cover(quoted_src.strip().strip('"'))


def main():
    """Crawl every Play Store category and report each app it lists.

    Connects to the local ``googlemarket`` MySQL database first and exits
    with status 1 if that fails.  For each category the listing page is
    fetched, the app links on it are de-duplicated, and every app page is
    scraped: its fields are printed and its cover images are written to
    ``googlemarket/<n>``.  Pages that cannot be fetched are skipped.
    """
    try:
        conn = MySQLdb.connect(host='localhost', user='root', passwd='123456',
                               db='googlemarket', charset="utf8")
        conn.query("set names utf8")
    except MySQLdb.Error as e:
        print(e)
        sys.exit(1)

    try:
        for category in _CATEGORIES:
            listing = _fetch_pretty(
                _PLAY_ROOT + "/store/apps/category/" + category)
            if listing is None:
                continue
            # sorted() gives a repeatable visiting order for the unique links.
            for href in sorted(set(_APP_LINK_RE.findall(listing))):
                _crawl_app(_PLAY_ROOT + href)
    finally:
        conn.close()













# Start the crawl only when the file is run as a script, not when imported.
if __name__ == "__main__":
    main()

<type 'str'>

Traceback (most recent call last):

File "crawler0729.py", line 103, in <module>

main()

File "crawler0729.py", line 91, in main

temp=requests.get(j[1:-2], headers=headers)

File "/usr/local/lib/python2.7/dist-packages/requests/api.py", line 55, in get

return request('get', url, **kwargs)

File "/usr/local/lib/python2.7/dist-packages/requests/api.py", line 44, in request

return session.request(method=method, url=url, **kwargs)

File "/usr/local/lib/python2.7/dist-packages/requests/sessions.py", line 335, in request

resp = self.send(prep, **send_kwargs)

File "/usr/local/lib/python2.7/dist-packages/requests/sessions.py", line 438, in send

r = adapter.send(request, **kwargs)

File "/usr/local/lib/python2.7/dist-packages/requests/adapters.py", line 327, in send

raise ConnectionError(e)

requests.exceptions.ConnectionError: HTTPSConnectionPool(host='lh3.ggpht.com', port=443): Max retries exceeded with url: /RBld17rLw4Ik0JtOaKk4bZB2RiGJ2R8H5Q8Rjw3Hh6BAM694fOzzKj1TJFr7R02ZS_40=w30 (Caused by <class 'socket.error'>: [Errno 101] Network is unreachable)


,站群,专题

企业建站2800元起,携手武汉肥猫科技,做一个有见地的颜值派!更多优惠请戳:天门SEO http://tianmen.raoyu.net

  • 上一篇:paip.python ide eric5-5.3.7 安装以及建立项目使用
  • 下一篇:最后一页