Scraping novels with a Scrapy spider

A note before we start: a naming mistake of mine kept raising ImportError: cannot import name on the line from xiaoshuo1.items import Xiaoshuo1Item. The IDE flags this project-level import in red (red underline, but it still runs); the actual cause turned out to be the misspelled name.
Without further ado, let's get started.
On a local Windows machine, set up an isolated environment:

pip install virtualenv virtualenvwrapper-win
mkvirtualenv xiaoshuo1
pip install scrapy
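
With the environment active, generate the Scrapy project itself. This step isn't shown above; the project name xiaoshuo1 is assumed to match the imports used later:

scrapy startproject xiaoshuo1
cd xiaoshuo1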

This gives you the project directory.
First create a runner script, go.py, at the project root:

from scrapy.cmdline import execute

# equivalent to running "scrapy crawl xiaoshuo1" from the project root
execute(['scrapy', 'crawl', 'xiaoshuo1'])
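
With this file, the spider can be launched (and stepped through in a debugger) straight from an IDE instead of a terminal.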

Now configure the spider.
1. items.py

import scrapy


class Xiaoshuo1Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()          # novel title
    author = scrapy.Field()        # author
    novelurl = scrapy.Field()      # novel URL
    serialstatus = scrapy.Field()  # serialization status
    serialnumber = scrapy.Field()  # word count so far
    category = scrapy.Field()      # category (set by the spider below)
    name_id = scrapy.Field()       # the site's ID for the novel (used for dedup)

This file declares the fields we want to scrape. An Item behaves like a dict, but only declared fields may be assigned, which is also why category and name_id, set by the spider later, must be declared here.
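
A minimal sketch of that dict-like behavior (standalone, assuming the items.py above):

from xiaoshuo1.items import Xiaoshuo1Item

item = Xiaoshuo1Item()
item['name'] = 'some title'   # fine: 'name' is a declared field
# item['foo'] = 'bar'         # would raise KeyError: 'foo' is not declared
print(dict(item))             # {'name': 'some title'}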
2. Create the spider file spider1.py in the spiders directory:

import re
import scrapy
from bs4 import BeautifulSoup
from scrapy.http import Request
from xiaoshuo1.items import Xiaoshuo1Item


class Myspider(scrapy.Spider):
    name = 'xiaoshuo1'  # the spider name used by "scrapy crawl"; must match go.py
    allowed_domains = ['x23us.com']
    bash_url = 'https://www.x23us.com/class/'
    bashurl = '.html'

    def start_requests(self):
        for i in range(1, 11):
            # builds e.g. https://www.x23us.com/class/1_1.html; i is the category index
            url = self.bash_url + str(i) + '_1' + self.bashurl
            yield Request(url, self.parse)
        yield Request('https://www.x23us.com/quanben/1', self.parse)

    def parse(self, response):
        print(response.text)

At this point you can run go.py to check whether the site's pages come back.
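
If the fetch fails straight away, the generated project's default settings are a common cause; a minimal, assumed settings.py tweak (exact values are up to you):

# settings.py -- loosen defaults that often block a first test crawl
ROBOTSTXT_OBEY = False  # the generated project defaults to True
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'  # look like a browser
DOWNLOAD_DELAY = 1      # be polite: roughly one request per second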
Next, flesh out the parsing logic (note that the version below switches to the 23us.so domain):

import re
import scrapy
from bs4 import BeautifulSoup
from scrapy.http import Request
from xiaoshuo1.items import Xiaoshuo1Item


class Myspider(scrapy.Spider):
    name = 'xiaoshuo1'
    allowed_domains = ['23us.so']
    bash_url = 'http://www.23us.so/list/'
    bashurl = '.html'

    def start_requests(self):
        for i in range(1, 11):
            url = self.bash_url + str(i) + '_1' + self.bashurl
            yield Request(url, self.parse)
        yield Request('http://www.23us.so/full.html', self.parse)

    def parse(self, response):
        # the last <a> inside div.pagelink holds the highest page number
        max_num = BeautifulSoup(response.text, 'lxml').find('div', class_='pagelink').find_all('a')[-1].get_text()
        bashurl = str(response.url)[:-7]  # strip the trailing '_1.html'
        for num in range(1, int(max_num) + 1):
            url = bashurl + '_' + str(num) + self.bashurl
            yield Request(url, callback=self.get_name)

    def get_name(self, response):
        tds = BeautifulSoup(response.text, 'lxml').find_all('tr', bgcolor='#FFFFFF')
        for td in tds:
            novelname = td.find('a').get_text()
            novelurl = td.find('a')['href']
            yield Request(novelurl, callback=self.get_chapterurl, meta={'name': novelname, 'url': novelurl})

    def get_chapterurl(self, response):
        item = Xiaoshuo1Item()
        item['name'] = str(response.meta['name']).replace('\xa0', '')
        item['novelurl'] = response.meta['url']
        soup = BeautifulSoup(response.text, 'lxml')
        category = soup.find('table').find('a').get_text()
        author = soup.find('table').find_all('td')[1].get_text()
        bash_url = soup.find('p', class_='btnlinks').find('a', class_='read')['href']
        name_id = str(bash_url)[-6:-1].replace('/', '')
        item['category'] = str(category).replace('/', '')
        item['author'] = str(author).replace('/', '')
        item['name_id'] = name_id
        return item
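
To make the URL arithmetic in parse concrete, a standalone sketch (the page URL is assumed from the pattern above):

# str(response.url)[:-7] removes the 7 characters of '_1.html'
url = 'http://www.23us.so/list/1_1.html'
bashurl = url[:-7]                       # 'http://www.23us.so/list/1'
print(bashurl + '_' + str(2) + '.html')  # 'http://www.23us.so/list/1_2.html'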

Create a new mysqlpipelines package in the project to hold the database code (add an empty __init__.py so the directory imports as a package), with two files inside: pipelines.py and sql.py.
Set up pipelines.py:

from .sql import Sql
from xiaoshuo1.items import Xiaoshuo1Item


class Xiaoshuo1Pipeline(object):
    def process_item(self, item, spider):
        if isinstance(item, Xiaoshuo1Item):
            name_id = item['name_id']
            ret = Sql.select_name(name_id)
            if ret[0] == 1:
                print('already stored, skipping')
            else:
                xs_name = item['name']
                xs_author = item['author']
                category = item['category']
                Sql.insert_dd_name(xs_name, xs_author, category, name_id)
                print('storing title')
        return item
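
Returning the item keeps it flowing to any lower-priority pipelines registered in ITEM_PIPELINES; to drop an item instead, the usual Scrapy pattern is to raise scrapy.exceptions.DropItem.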

Set up sql.py:

import mysql.connector
from xiaoshuo1 import settings

MYSQL_HOSTS = settings.MYSQL_HOSTS
MYSQL_USER = settings.MYSQL_USER
MYSQL_PASSWORD = settings.MYSQL_PASSWORD
MYSQL_PORT = settings.MYSQL_PORT
MYSQL_DB = settings.MYSQL_DB

# one module-level connection, shared by the class methods below
cnx = mysql.connector.connect(user=MYSQL_USER, password=MYSQL_PASSWORD,
                              host=MYSQL_HOSTS, port=MYSQL_PORT, database=MYSQL_DB)
cur = cnx.cursor(buffered=True)


class Sql:
    @classmethod
    def insert_dd_name(cls, xs_name, xs_author, category, name_id):
        sql = ('INSERT INTO dd_name (`xs_name`, `xs_author`, `category`, `name_id`) '
               'VALUES (%(xs_name)s, %(xs_author)s, %(category)s, %(name_id)s)')
        value = {
            'xs_name': xs_name,
            'xs_author': xs_author,
            'category': category,
            'name_id': name_id
        }
        cur.execute(sql, value)
        cnx.commit()

    @classmethod
    def select_name(cls, name_id):
        # returns (1,) if the name_id is already in dd_name, else (0,)
        sql = "SELECT EXISTS(SELECT 1 FROM dd_name WHERE name_id=%(name_id)s)"
        value = {'name_id': name_id}
        cur.execute(sql, value)
        return cur.fetchall()[0]
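
A quick way to exercise the helpers from a Python shell (the import path assumes the mysqlpipelines package above; the name_id '48537' is made up):

from xiaoshuo1.mysqlpipelines.sql import Sql

print(Sql.select_name('48537'))           # (0,) if unseen, (1,) if already stored
Sql.insert_dd_name('some title', 'some author', 'fantasy', '48537')
print(Sql.select_name('48537')[0] == 1)   # True after the insert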

Create the database table:

DROP TABLE IF EXISTS `dd_name`;
CREATE TABLE `dd_name` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `xs_name` varchar(255) DEFAULT NULL,
  `xs_author` varchar(255) DEFAULT NULL,
  `category` varchar(255) DEFAULT NULL,
  `name_id` varchar(255) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
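
Since deduplication keys on name_id, you can optionally let MySQL enforce it too (table and column names as above):

ALTER TABLE `dd_name` ADD UNIQUE KEY `uq_name_id` (`name_id`);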

Add the database settings to settings.py:

MYSQL_HOSTS='127.0.0.1'
MYSQL_USER='root'
MYSQL_PASSWORD='root'
MYSQL_PORT=3306
MYSQL_DB='liu'
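
These values must match your local MySQL instance, and the liu database has to exist before the first run.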

Then register the pipeline in ITEM_PIPELINES, also in settings.py:

ITEM_PIPELINES = {
    'xiaoshuo1.mysqlpipelines.pipelines.Xiaoshuo1Pipeline': 1,
}
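
The value 1 is the pipeline's priority; when several pipelines are registered, Scrapy runs them in ascending order of this number (conventionally 0-1000).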

Finally, run go.py.
