运行版本:
Python 3.7.0
完整代码如下:
# -*- coding: utf-8 -*-
"""
@author:lee
@create_time:2018/10/18 11:42
"""
from bs4 import BeautifulSoup
import requests
import bs4
import pymysql.cursors
def gethtml(url, headers):
    """Fetch *url* and return its body as UTF-8 text, or None on failure.

    Args:
        url: Page URL to request.
        headers: HTTP header dict (supplies the browser User-Agent).

    Returns:
        The decoded page text on HTTP 200, otherwise None (non-200
        status or any request-level error).
    """
    try:
        # The request itself can fail (DNS error, timeout, reset), so it
        # must live inside the try block; a bounded timeout keeps the
        # crawler from hanging forever on a dead server.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            print('抓取成功网页长度:', len(response.text))
            response.encoding = 'utf-8'
            return response.text
    except requests.RequestException as e:
        # Narrow catch: only request-level errors, not KeyboardInterrupt.
        print('抓取出现错误:', e)
    return None
def getsoup(html, list):
    """Parse one Maoyan board page and append one record per movie.

    Each record is ``[top, name, replease_times, score]`` appended to
    *list* in place.

    Args:
        html: Page HTML as a string.
        list: Output list, mutated in place.  (The name shadows the
            builtin but is kept for backward compatibility.)
    """
    soup = BeautifulSoup(html, 'lxml')
    for dd in soup.find_all('dd'):
        if not isinstance(dd, bs4.element.Tag):
            continue
        try:
            top = dd.i.string  # ranking number inside <i>
            name = dd.find('p', class_='name').string
            replease_times = dd.find('p', class_="releasetime").string
            # The score is split across two child nodes of
            # <p class="score"> (integer and fraction parts); rejoin them.
            s = dd.find('p', class_="score").contents
            score = s[0].string + s[1].string
        except (AttributeError, IndexError, TypeError):
            # A malformed <dd> (missing child tag) should skip one entry,
            # not abort parsing of the whole page.
            continue
        list.append([top, name, replease_times, score])
def write_sql(data):
    """Insert movie records into the MySQL table ``maoyan``.

    Each row is committed individually; a failed row is rolled back and
    reported without aborting the remaining rows.

    Args:
        data: Iterable of ``[top, name, replease_times, score]`` rows.
    """
    # charset matters here: the titles are Chinese text.
    conn = pymysql.connect(host='localhost', user='root', password='123456',
                           port=3306, db='spiders', charset='utf8mb4')
    # Hoisted out of the loop — the statement text never changes.
    sql = 'INSERT INTO maoyan(top,name,replease_times,score) VALUES (%s,%s,%s,%s)'
    try:
        # `with` closes the cursor; the original leaked it.
        with conn.cursor() as cur:
            for movie in data:
                try:
                    cur.execute(sql, movie)
                    conn.commit()
                    print('写入成功')
                except pymysql.MySQLError as e:
                    # Narrow catch: only database errors, per-row recovery.
                    print('导入失败', e)
                    conn.rollback()
    finally:
        # Always release the connection, even if iteration itself raises.
        conn.close()
def main():
    """Crawl the 10 pages of the Maoyan Top-100 board and store each
    page's movies in MySQL, printing every page's records as it goes."""
    start_url = 'http://maoyan.com/board/4'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
    }
    depth = 10  # 10 pages x 10 movies = top 100
    for i in range(depth):
        url = start_url + '?offset=' + str(10 * i)
        html = gethtml(url, headers)
        if html is None:
            # gethtml returns None on a failed request or non-200 status;
            # skip this page instead of letting the parser crash on None.
            continue
        # Renamed from `list`, which shadowed the builtin.
        movies = []
        getsoup(html, movies)
        write_sql(movies)
        print(movies)
# Run the crawler only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
运行结果: