1. 引言
最近一直在学习数理统计基础知识,趁着5.1必须狠狠地撸一撸爬虫代码
以下代码基于Scrapy Spider的派生类CrawlSpider的简单使用,并搭配ItemLoader与TwistedPipeline实现异步插入
由于本文涉及知识太多,仅供交流与阅读,并不涉及反爬,正则,清洗等相关基础知识介绍,如有不明白的还请BAIDU查找关键字
OK废话不多说,让我们进入实战阶段吧~
2. 环境及创建crawl spider
环境
win10 / python3.5 / pycharm
创建crawl spider
>>cmd
# 标的拉勾网
scrapy genspider -t crawl lagou www.lagou.com
3. Crawl Spider Code
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider,Rule
from article.items import LagouItemLoader,LagouItem
from article.utils.common import hash_md5
import datetime
class LagouSpider(CrawlSpider):
    """Crawl www.lagou.com and scrape every job-detail page into a LagouItem.

    Listing pages (``zhaopin/...``) and company pages (``gongsi/j<id>.html``)
    are only followed to discover more links; job pages (``jobs/...``) found
    inside the position list are handed to :meth:`parse_item`.
    """

    name = 'lagou'
    allowed_domains = ['www.lagou.com']
    start_urls = ['https://www.lagou.com/']

    # LinkExtractor
    #   Purpose: pull links out of a response; those links are crawled next.
    #   Main parameters:
    #     allow:           URLs matching these regexes are extracted; when
    #                      empty, every link matches.
    #     deny:            URLs matching these regexes are never extracted.
    #     allow_domains:   only links on these domains are extracted.
    #     deny_domains:    links on these domains are never extracted.
    #     restrict_xpaths / restrict_css: limit extraction to the selected
    #                      page regions; combined with ``allow`` to filter.
    # A Rule without a callback follows links by default (follow=True).
    #
    # IMPORTANT: never name a rule callback ``parse``. CrawlSpider implements
    # its own logic in ``parse``; overriding it breaks the crawl.
    rules = (
        Rule(LinkExtractor(allow=(r'zhaopin/.*',)), follow=True),
        # ``\d+`` so that company ids with more than one digit are followed
        # too (the previous ``\d`` matched a single-digit id only).
        Rule(LinkExtractor(allow=(r'gongsi/j\d+\.html',)), follow=True),
        Rule(
            LinkExtractor(
                allow=(r'jobs/.*',),
                restrict_css=("div#s_position_list ul.item_con_list"),
            ),
            callback='parse_item',
            follow=False,
        ),
    )

    def parse_item(self, response):
        """Extract one job posting from a job-detail ``response``.

        Only the extraction selectors live here; value cleaning is declared
        on the item fields (input/output processors in the items module).

        Returns:
            The populated item produced by ``LagouItemLoader.load_item()``.
        """
        loader = LagouItemLoader(item=LagouItem(), response=response)

        # In ``scrapy shell`` a selector needs ``.extract()``; the loader's
        # add_css / add_xpath helpers extract the values themselves.
        loader.add_css("title", "div.job-name::attr(title)")
        loader.add_value("url", response.url)
        loader.add_value("url_object_id", hash_md5(response.url))
        loader.add_css("salary", "span.salary::text")
        loader.add_xpath("job_city", ".//*[@class ='job_request']/p/span[2]/text()")
        loader.add_xpath("work_years", ".//*[@class ='job_request']/p/span[3]/text()")
        loader.add_xpath("degree_need", ".//*[@class ='job_request']/p/span[4]/text()")
        loader.add_xpath("job_type", ".//*[@class ='job_request']/p/span[5]/text()")
        loader.add_css("tags", "li.labels::text")
        loader.add_css("publish_time", "p.publish_time::text")
        loader.add_css("job_advantage", "dd.job-advantage p::text")
        loader.add_css("job_desc", "dd.job_bt div p::text")
        loader.add_css("work_addr", "div.work_addr")
        loader.add_css("company_name", "dl.job_company a img::attr(alt)")
        loader.add_css("company_url", "dl.job_company dt a::attr(href)")
        loader.add_value("crawl_time", datetime.datetime.now())

        # Handy breakpoint location: the loaded item shows every scraped
        # key/value pair.
        lagou_item = loader.load_item()
        return lagou_item
3. ItemLoader Code
import re
import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose,TakeFirst,Join
from w3lib.html import remove_tags
from article.settings import SQL_DATETIME_FORMAT,SQL_DATE_FORMAT
import datetime
def ends_filter(value):
    """Clean one raw text fragment scraped from a Lagou job page.

    Exactly one rule is applied, checked in this order:

    1. Contains "查看地图" (the "view map" link text): split on newlines,
       drop every line containing that text, strip the rest and join them
       with single spaces.
    2. Contains "发布于拉勾网" (the "published on Lagou" suffix): remove it.
    3. Contains "/": remove every slash.
    4. Otherwise: return the value unchanged apart from stripping.

    Args:
        value: The raw string to clean.

    Returns:
        The cleaned string with surrounding whitespace stripped.
    """
    if "查看地图" in value:
        lines = value.split("\n")
        kept = [line.strip() for line in lines if "查看地图" not in line]
        return " ".join(kept).strip()
    elif "发布于拉勾网" in value:
        return value.replace("发布于拉勾网", "").strip()
    elif "/" in value:
        return value.replace("/", "").strip()
    else:
        return value.strip()
class LagouItemLoader(ItemLoader):
    """ItemLoader for Lagou job items that uses TakeFirst as default output."""

    # An ItemLoader gathers the values extracted for a field into a list, so
    # the default output processor is overridden here with TakeFirst().
    default_output_processor = TakeFirst()
class LagouItem(scrapy.Item):
    """One Lagou job posting, plus the SQL needed to store it.

    Cleaning functions are attached to the fields through ``scrapy.Field``
    processors; ``MapCompose`` accepts several functions and applies them in
    order to each extracted value.
    """

    title = scrapy.Field()
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    salary = scrapy.Field(
        input_processor=MapCompose(ends_filter)
    )
    job_city = scrapy.Field(
        input_processor=MapCompose(ends_filter)
    )
    work_years = scrapy.Field(
        input_processor=MapCompose(ends_filter)
    )
    degree_need = scrapy.Field(
        input_processor=MapCompose(ends_filter)
    )
    job_type = scrapy.Field()
    # Multi-valued fields are joined into one string instead of the
    # loader's default output processor.
    tags = scrapy.Field(
        output_processor=Join(",")
    )
    publish_time = scrapy.Field(
        input_processor=MapCompose(ends_filter)
    )
    job_advantage = scrapy.Field(
        output_processor=Join("\n")
    )
    job_desc = scrapy.Field(
        output_processor=Join("\n")
    )
    # The address is selected as an HTML element, so tags are removed before
    # the text is cleaned.
    work_addr = scrapy.Field(
        input_processor=MapCompose(remove_tags, ends_filter)
    )
    company_name = scrapy.Field()
    company_url = scrapy.Field()
    crawl_time = scrapy.Field()

    def insert_values(self):
        """Build the parameterized INSERT statement for this item.

        Keeping the SQL on the item lets a generic pipeline store any item
        type without being modified for each new spider.

        Returns:
            A ``(insert_sql, params)`` tuple for ``cursor.execute``.

        Raises:
            KeyError: If any of the sixteen fields was not populated.
        """
        # Sixteen columns, sixteen placeholders. The placeholder list must
        # not end with a comma, which MySQL rejects as a syntax error.
        # NOTE(review): the ON DUPLICATE KEY clause presumably relies on a
        # unique/primary key on url_object_id -- confirm against the schema.
        insert_sql = """
            insert into lagou_job(url_object_id, title, url, salary, job_city, work_years,
            degree_need, job_type, publish_time, tags, job_advantage, job_desc, work_addr,
            company_url, company_name, crawl_time)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE salary=VALUES(salary), job_desc=VALUES(job_desc)
        """
        params = (
            self["url_object_id"], self["title"], self["url"], self["salary"], self["job_city"],
            self["work_years"], self["degree_need"], self["job_type"], self["publish_time"],
            self["tags"], self["job_advantage"], self["job_desc"], self["work_addr"],
            self["company_url"], self["company_name"],
            self["crawl_time"].strftime(SQL_DATETIME_FORMAT),
        )
        return insert_sql, params
4. TwistedPipline Code
class MysqlTwistedPipline(object):
    """Generic MySQL pipeline that inserts items through a Twisted pool.

    Any item exposing ``insert_values()`` (returning ``(sql, params)``) can
    be stored without changing this class.

    NOTE(review): this class uses ``MySQLdb`` / ``MySQLdb.cursors`` and
    ``adbapi`` (presumably ``twisted.enterprise.adbapi``); make sure the
    module imports them.
    """

    def __init__(self, dbpool):
        """Store the connection pool used for all inserts."""
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        """Build the pipeline from the MYSQL_* values in ``settings``."""
        dbparms = dict(
            host=settings["MYSQL_HOST"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            passwd=settings["MYSQL_PASSWORD"],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        """Schedule the insert for ``item`` on the pool and pass the item on.

        Returns:
            The same ``item``, so that any later pipeline still receives it.
        """
        # runInteraction hands the insert to the pool, so it does not run
        # inline here.
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error, item, spider)  # report failures
        return item

    def handle_error(self, failure, item, spider):
        """Report a failed asynchronous insert."""
        print(failure)

    def do_insert(self, cursor, item):
        """Execute the item's own INSERT statement on ``cursor``.

        The SQL and parameters come from ``item.insert_values()``, so each
        item type decides how it is stored.
        """
        insert_sql, params = item.insert_values()
        print(insert_sql, params)
        cursor.execute(insert_sql, params)
        return item
最后附crawl spider源码解析链接:
http://www.lai18.com/content/471040.htm