A naming mistake of my own kept triggering ImportError: cannot import name on the line from xiaoshuo1.items import Xiaoshuo1Item. (The IDE underlines this project-internal import in red, but it runs fine despite the red underline.) The cause turned out to be exactly that naming error.
Without further ado, let's get started.
On a local Windows machine, set up an isolated environment:
pip install virtualenv virtualenvwrapper-win
mkvirtualenv xiaoshuo1
pip install scrapy
Then run scrapy startproject xiaoshuo1 to generate the project directory:
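The layout after startproject looks roughly like this (go.py is added by hand in the next step, at the top level next to scrapy.cfg; details vary slightly by Scrapy version):

xiaoshuo1/
    scrapy.cfg
    go.py
    xiaoshuo1/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            spider1.py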
First, create a runner script go.py at the project root:
from scrapy.cmdline import execute
execute(['scrapy','crawl', 'xiaoshuo1'])
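Running go.py from inside the project directory is equivalent to typing scrapy crawl xiaoshuo1 on the command line; the last argument must match the name attribute of the spider class defined below.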
Now set up the spider:
1. items.py
import scrapy

class Xiaoshuo1Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()            # novel title
    author = scrapy.Field()          # author
    novelurl = scrapy.Field()        # novel URL
    serialstatus = scrapy.Field()    # serialization status
    serialnumber = scrapy.Field()    # serialized word count
    category = scrapy.Field()        # category; assigned by the spider below
    name_id = scrapy.Field()         # numeric id extracted from the novel URL
This file declares the fields we want to scrape; the spider can only assign keys that are declared here, which is why category and name_id are included above.
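An Item behaves like a dict that only accepts declared keys; assigning anything undeclared raises a KeyError. A quick illustration:

item = Xiaoshuo1Item()
item['name'] = 'test'   # fine: 'name' is declared above
# item['foo'] = 1       # KeyError: Xiaoshuo1Item does not support field: foo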
2. Create a spider file spider1.py in the spiders directory:
import re
import scrapy
from bs4 import BeautifulSoup
from scrapy.http import Request
from xiaoshuo1.items import Xiaoshuo1Item

class Myspider(scrapy.Spider):
    name = 'xiaoshuo1'  # the spider name; must match the argument passed to `scrapy crawl` in go.py
    allowed_domains = ['x23us.com']
    bash_url = 'https://www.x23us.com/class/'
    bashurl = '.html'

    def start_requests(self):
        for i in range(1, 11):
            # actual URL: https://www.x23us.com/class/1_1.html, where i is the category id
            url = self.bash_url + str(i) + '_1' + self.bashurl
            yield Request(url, self.parse)
        yield Request('https://www.x23us.com/quanben/1', self.parse)

    def parse(self, response):
        print(response.text)
At this point you can already run go.py to check whether the site can be fetched.
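If go.py comes back empty-handed or with a 403, the site is likely rejecting Scrapy's default user agent, or robots.txt is blocking the crawl. A few settings.py tweaks that often help (the UA string below is just an example):

# settings.py
ROBOTSTXT_OBEY = False   # don't skip pages disallowed by robots.txt
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'  # browser-like UA
DOWNLOAD_DELAY = 1       # be polite: pause between requests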
Next, flesh out the spider:
import re
import scrapy
from bs4 import BeautifulSoup
from scrapy.http import Request
from xiaoshuo1.items import Xiaoshuo1Item

class Myspider(scrapy.Spider):
    name = 'xiaoshuo1'
    allowed_domains = ['23us.so']
    bash_url = 'http://www.23us.so/list/'
    bashurl = '.html'

    def start_requests(self):
        for i in range(1, 11):
            url = self.bash_url + str(i) + '_1' + self.bashurl
            yield Request(url, self.parse)
        yield Request('http://www.23us.so/full.html', self.parse)

    def parse(self, response):
        # the last <a> in div.pagelink holds the highest page number of this category
        max_num = BeautifulSoup(response.text, 'lxml').find('div', class_='pagelink').find_all('a')[-1].get_text()
        bashurl = str(response.url)[:-7]
        for num in range(1, int(max_num) + 1):
            url = bashurl + '_' + str(num) + self.bashurl
            yield Request(url, callback=self.get_name)

    def get_name(self, response):
        # each novel sits in a <tr bgcolor="#FFFFFF"> row of the listing table
        tds = BeautifulSoup(response.text, 'lxml').find_all('tr', bgcolor='#FFFFFF')
        for td in tds:
            novelname = td.find('a').get_text()
            novelurl = td.find('a')['href']
            yield Request(novelurl, callback=self.get_chapterurl,
                          meta={'name': novelname, 'url': novelurl})

    def get_chapterurl(self, response):
        item = Xiaoshuo1Item()
        item['name'] = str(response.meta['name']).replace('\xa0', '')
        item['novelurl'] = response.meta['url']
        soup = BeautifulSoup(response.text, 'lxml')
        category = soup.find('table').find('a').get_text()
        author = soup.find('table').find_all('td')[1].get_text()
        bash_url = soup.find('p', class_='btnlinks').find('a', class_='read')['href']
        name_id = str(bash_url)[-6:-1].replace('/', '')
        item['category'] = str(category).replace('/', '')
        item['author'] = str(author).replace('/', '')
        item['name_id'] = name_id
        return item
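parse relies on the last <a> inside div.pagelink holding the highest page number. Here is a standalone check of that extraction against a mocked-up fragment (the HTML below imitates the real page; it is not copied from it):

from bs4 import BeautifulSoup

html = '''
<div class="pagelink">
  <a href="/list/1_1.html">1</a>
  <a href="/list/1_2.html">2</a>
  <a href="/list/1_964.html">964</a>
</div>
'''
max_num = BeautifulSoup(html, 'lxml').find('div', class_='pagelink').find_all('a')[-1].get_text()
print(max_num)  # -> 964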
Now create a new mysqlpipelines package in the project for the database code.
Add pipelines.py and sql.py inside it (plus an empty __init__.py so the directory is importable as a package).
Set up pipelines.py:
from .sql import Sql
from xiaoshuo1.items import Xiaoshuo1Item

class Xiaoshuo1Pipeline(object):
    def process_item(self, item, spider):
        if isinstance(item, Xiaoshuo1Item):
            name_id = item['name_id']
            ret = Sql.select_name(name_id)
            if ret[0] == 1:
                print('already in the database')  # skip duplicates
            else:
                xs_name = item['name']
                xs_author = item['author']
                category = item['category']
                Sql.insert_dd_name(xs_name, xs_author, category, name_id)
                print('saving title')
        return item  # always hand the item on to any later pipelines
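A variant worth knowing: Scrapy's idiomatic way to discard a duplicate is raising DropItem, which also gets counted in the crawl stats. The same pipeline, sketched with DropItem instead of a print:

from scrapy.exceptions import DropItem
from .sql import Sql
from xiaoshuo1.items import Xiaoshuo1Item

class Xiaoshuo1Pipeline(object):
    def process_item(self, item, spider):
        if isinstance(item, Xiaoshuo1Item):
            if Sql.select_name(item['name_id'])[0] == 1:
                raise DropItem('duplicate novel: %s' % item['name'])
            Sql.insert_dd_name(item['name'], item['author'],
                               item['category'], item['name_id'])
        return item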
Set up sql.py:
import mysql.connector
from xiaoshuo1 import settings

MYSQL_HOSTS = settings.MYSQL_HOSTS
MYSQL_USER = settings.MYSQL_USER
MYSQL_PASSWORD = settings.MYSQL_PASSWORD
MYSQL_PORT = settings.MYSQL_PORT
MYSQL_DB = settings.MYSQL_DB

cnx = mysql.connector.connect(user=MYSQL_USER, password=MYSQL_PASSWORD,
                              host=MYSQL_HOSTS, database=MYSQL_DB)
cur = cnx.cursor(buffered=True)

class Sql:
    @classmethod
    def insert_dd_name(cls, xs_name, xs_author, category, name_id):
        sql = ('INSERT INTO dd_name (`xs_name`, `xs_author`, `category`, `name_id`) '
               'VALUES (%(xs_name)s, %(xs_author)s, %(category)s, %(name_id)s)')
        value = {
            'xs_name': xs_name,
            'xs_author': xs_author,
            'category': category,
            'name_id': name_id
        }
        cur.execute(sql, value)
        cnx.commit()

    @classmethod
    def select_name(cls, name_id):
        sql = 'SELECT EXISTS(SELECT 1 FROM dd_name WHERE name_id=%(name_id)s)'
        value = {'name_id': name_id}
        cur.execute(sql, value)
        return cur.fetchall()[0]
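The Sql class can be smoke-tested on its own before wiring it into the pipeline (the values below are made up, and the dd_name table from the next step must already exist; the import path assumes the mysqlpipelines package name used above):

from xiaoshuo1.mysqlpipelines.sql import Sql

Sql.insert_dd_name('test novel', 'test author', 'fantasy', '12345')
print(Sql.select_name('12345'))  # -> (1,) once the row exists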
Create the database table:
DROP TABLE IF EXISTS `dd_name`;
CREATE TABLE `dd_name` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`xs_name` varchar(255) DEFAULT NULL,
`xs_author` varchar(255) DEFAULT NULL,
`category` varchar(255) DEFAULT NULL,
`name_id` varchar(255) DEFAULT NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=38 DEFAULT CHARSET=utf8mb4;
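Optionally, a UNIQUE index on name_id would let MySQL itself reject duplicate rows, as a backstop for the select_name check the pipeline performs.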
Add the database settings in settings.py:
MYSQL_HOSTS='127.0.0.1'
MYSQL_USER='root'
MYSQL_PASSWORD='root'
MYSQL_PORT='3306'
MYSQL_DB='liu'
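Note that the connect() call in sql.py never passes the port, so mysql.connector falls back to its default of 3306. If your server listens elsewhere, pass it explicitly (converting the string first):

cnx = mysql.connector.connect(user=MYSQL_USER, password=MYSQL_PASSWORD,
                              host=MYSQL_HOSTS, port=int(MYSQL_PORT),
                              database=MYSQL_DB)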
Then register the pipeline in settings.py (the integer is the pipeline's order, 0-1000; lower runs first):
ITEM_PIPELINES = {
    'xiaoshuo1.mysqlpipelines.pipelines.Xiaoshuo1Pipeline': 1,
}
Run go.py. If everything is wired up correctly, new rows will start appearing in dd_name.