目标:将抓取的内容组织成如下由字典组成的列表形式(每件商品对应一个字典)
[{'url': 'xxxxxxxxx',
  'price': 'xxx',
  'id': 'xxx',
  'img': 'xxx',
  'title': 'xxx'
 },  # 宝贝1
 {'url': 'xxxxxxxxx',
  'price': 'xxx',
  'id': 'xxx',
  'img': 'xxx',
  'title': 'xxx'
 },  # 宝贝2
 {'url': 'xxxxxxxxx',
  'price': 'xxx',
  'id': 'xxx',
  'img': 'xxx',
  'title': 'xxx'
 },  # 宝贝3
 ........
]
分析及组织访问网址
搜索结果页网址中的查询字段以键值对的形式表示,其中 psort 表示产品的排序方式,page 表示当前是第几页,keyword 表示搜索产品的关键词。可以建立一个包含这三个参数的函数来构建访问网址,以此达到抓取特定产品信息的目的。
try:
    import urlparse  # Python 2
except ImportError:
    import urllib.parse as urlparse  # Python 3: same functions, new location
import urllib
from bs4 import BeautifulSoup
# Sample JD search URL; its query string is parsed below to see which
# parameters the search page takes.
url = "http://search.jd.com/Search?keyword=BOSE&enc=utf-8&qr=%E5%B7%B4%E5%A1%9E&qrst=correct&et=4&rt=1&stop=1&click=&psort=3"
parsed = urlparse.urlparse(url)
# keep_blank_values=True retains empty parameters such as "click=".
d = urlparse.parse_qs(parsed.query, keep_blank_values=True)
# Parsed result: every key maps to a list of values (key order may differ).
# On Python 2 the 'qr' value is the raw UTF-8 byte string shown here.
# {'qrst': ['correct'],
#  'rt': ['1'],
#  'enc': ['utf-8'],
#  'keyword': ['BOSE'],
#  'stop': ['1'],
#  'psort': ['3'],
#  'qr': ['\xe5\xb7\xb4\xe5\xa1\x9e'],
#  'et': ['4'],
#  'click': ['']}
# 'page' (e.g. 'page': ['1']) is another parameter of the search URL, but the
# sample URL above does not carry it, so it is not present in d.
构建函数访问特定网址
def get_content_from_jd(keyword='Bose', page='', product_sort=''):
    """Fetch the raw content of a JD.com search-result page.

    Args:
        keyword: Search term, sent as the ``keyword`` query parameter.
        page: Result page number, sent as ``page``. The default empty
            string sends the parameter with no value.
        product_sort: Sort mode, sent as ``psort``. The default empty
            string sends the parameter with no value.

    Returns:
        The response body exactly as read from the server (not decoded).

    Raises:
        IOError: If the request fails (on Python 3 this surfaces as
            ``urllib.error.URLError``, an ``OSError``/``IOError`` subclass).
    """
    # Local import with a fallback so the function runs on Python 2 and 3.
    try:
        from urllib import urlencode, urlopen  # Python 2
    except ImportError:
        from urllib.parse import urlencode  # Python 3
        from urllib.request import urlopen

    # Only keyword, psort and page come from the arguments; the remaining
    # parameters are fixed values.
    params = {
        'qrst': 'correct',
        'rt': '1',
        'enc': 'utf-8',
        'keyword': keyword,
        'stop': '1',
        'psort': product_sort,
        'et': '4',
        'page': page,
        'click': '',
    }
    query_string = urlencode(params)
    url = 'http://search.jd.com/Search?' + query_string

    response = urlopen(url)
    try:
        return response.read()
    finally:
        # Release the connection even when reading the body fails.
        response.close()
抓取商品ID
if __name__ == '__main__':
    # keyword='Bose', page='1', product_sort='3' (3 = sort by sales volume).
    bose_info = get_content_from_jd('Bose', '1', '3')
    # Name the parser explicitly so the parse result does not depend on which
    # optional parsers happen to be installed on the machine.
    bose_content = BeautifulSoup(bose_info, 'html.parser')
    # Each product on the page is wrapped in a <div class="lh-wrap">.
    bose_div = bose_content.find_all('div', class_='lh-wrap')
    # The product ID is taken from the first class name of the div's
    # first <strong> tag.
    for item in bose_div:
        id_tag = item.strong
        if id_tag is None or not id_tag.get('class'):
            # Skip entries that lack the expected markup instead of crashing.
            continue
        print(id_tag['class'][0])
输出内容(每行对应一件商品的ID):
J_1150767
J_1225287
J_1225340
J_1150768
J_1150841
J_1237666
J_1150698
J_1237665
J_1150821
J_1150753
J_1237663
J_1182133
J_1150733
J_1253123
J_1253126
J_1150701
J_1291112
J_1253232
J_1150689
J_1150690
J_1150699
J_1150704
J_1255545
J_1237659
J_1253213
J_1253226
J_1150759
J_1150760
J_1150766
J_1291110