# Python version: 3.5
# douban_self_info.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request,FormRequest
import urllib.request
class DbSpider(scrapy.Spider):
    """Log in to Douban and print the title of the post-login profile page.

    Flow:
      1. ``start_requests`` fetches the login page inside cookie jar 1.
      2. ``parse`` fills in the login form (asking the user to solve the
         captcha on stdin when the page contains one) and submits it.
      3. ``next`` receives the page Douban redirects to after login and
         prints its ``<title>``.
    """

    name = "db"
    allowed_domains = ["douban.com"]
    # start_urls = ['http://douban.com/']
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36"
        ),
    }

    # Page that serves the login form.
    login_url = "https://accounts.douban.com/login"
    # Page to return to after logging in (sent as the form's "redir" field).
    profile_url = "https://www.douban.com/people/82984134/"
    # Local file the captcha image is saved to so the user can look at it.
    # The containing directory must already exist.
    captcha_path = "E:/pictest/captchar.jpg"

    def start_requests(self):
        """Return the initial request for the login page.

        The request is tagged with ``cookiejar`` so that the session cookies
        set by the login page are reused by the later form submission. The
        spider's headers are sent here as well, so the login page is fetched
        with the same User-Agent as the form POST.
        """
        return [
            Request(
                self.login_url,
                headers=self.headers,
                callback=self.parse,
                meta={"cookiejar": 1},
            )
        ]

    def parse(self, response):
        """Fill in and submit the login form found in ``response``.

        If the page contains a captcha image, the image is downloaded to
        ``captcha_path`` and the solution is read from stdin.

        Returns a one-element list holding the login ``FormRequest``; its
        response is handled by ``next``.
        """
        captcha_src = response.xpath(
            "//img[@id='captcha_image']/@src"
        ).extract_first()

        # Fields shared by both the captcha and the no-captcha login.
        # The credentials are placeholders; replace them before running.
        data = {
            "form_email": "xxxxx",
            "form_password": "xxxxx",
            "redir": self.profile_url,
        }

        if captcha_src:
            print("此时有验证码")
            # urljoin() leaves absolute URLs untouched and resolves relative
            # ones against the login page URL.
            # NOTE: urlretrieve() and input() both block, so the crawl is
            # paused until the image is saved and the user has answered.
            urllib.request.urlretrieve(
                response.urljoin(captcha_src), filename=self.captcha_path
            )
            print("请查看本地验证码图片并输入验证码")
            data["captcha-solution"] = input()
        else:
            print("此时没有验证码")

        print("登录中。。。。。。")
        return [
            FormRequest.from_response(
                response,
                meta={"cookiejar": response.meta["cookiejar"]},
                headers=self.headers,
                formdata=data,
                callback=self.next,
            )
        ]

    def next(self, response):
        """Print the ``<title>`` of the page reached after submitting the login.

        Logs a warning instead of raising when the page has no title.
        """
        print("此时已经登录完成并爬取了个人中心的数据")
        title = response.xpath("/html/head/title/text()").extract_first()
        # db_id = response.xpath("//div[@class='infobox']/div[@class='p1']/text()").extract()
        if title is None:
            self.logger.warning("No <title> found in %s", response.url)
            return
        print(title)
        # print(db_id[0])