目标网站:http://movie.douban.com/top250
目标内容:
- 电影名称
- 电影信息
- 电影评分
- 电影短评(quote)
输出结果:生成csv文件
首先进行 settings 配置:
# Project root: two directory levels above this settings file.
BASE_DIR = os.path.dirname(os.path.dirname(__file__))
# Browser-like User-Agent string sent with every request.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3128.0 Safari/537.36'
# Export scraped items to a CSV file named douban.csv under BASE_DIR.
FEED_URI = 'file:///' + os.path.join(BASE_DIR, 'douban.csv')
# Scrapy's built-in exporter keys are lowercase; 'csv' is accepted by every
# Scrapy version, whereas 'CSV' only works where the value is lower-cased first.
FEED_FORMAT = 'csv'
Item设置
class DoubanMovieItem(scrapy.Item):
    """Fields scraped for a single movie entry on the Douban Top 250 list."""

    # Movie title text.
    title = scrapy.Field()
    # Descriptive text lines for the movie (the spider joins them with ';').
    movie_info = scrapy.Field()
    # Rating value as shown on the page (kept as text).
    star = scrapy.Field()
    # One-line quote for the movie; empty string when the page has none.
    quote = scrapy.Field()
爬虫编写
# 基本信息
class DoubantestSpider(Spider):
    """Spider that scrapes the Douban Top 250 movie list page by page.

    For each movie entry it yields a ``DoubanMovieItem`` with the title,
    info text, rating and quote, then follows the "next page" link until
    no such link is present.
    """

    name = 'doubantest'
    start_urls = ['https://movie.douban.com/top250']
    # Prefix prepended to the "next page" href found on each listing page.
    # NOTE(review): assumes that href is a relative query string - confirm.
    base_url = 'https://movie.douban.com/top250'

    def parse(self, response):
        """Parse one listing page.

        Yields one ``DoubanMovieItem`` per movie block found on the page,
        followed by a ``Request`` for the next page when a next link exists.
        """
        for movie in response.xpath('//div[@class="info"]'):
            title = movie.xpath('div[@class="hd"]/a/span/text()').extract()
            movie_info = movie.xpath('div[@class="bd"]/p/text()').extract()
            star = movie.xpath('div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()').extract()
            quote = movie.xpath('div[@class="bd"]/p[@class="quote"]/span/text()').extract()

            # Build a fresh item per movie: reusing one instance would make
            # every yielded item alias the same mutable object.
            douban_movie_item = DoubanMovieItem()
            douban_movie_item['title'] = ''.join(title)
            douban_movie_item['movie_info'] = ';'.join(movie_info)
            # Missing rating/quote nodes fall back to '' instead of raising
            # IndexError and aborting the rest of the page.
            douban_movie_item['star'] = star[0] if star else ''
            douban_movie_item['quote'] = quote[0] if quote else ''
            yield douban_movie_item

        # Follow pagination: the href is appended to base_url.
        next_link = response.xpath('//span[@class="next"]/link/@href').extract()
        if next_link:
            yield Request(self.base_url + next_link[0], callback=self.parse)
运行main.py,结果