# The requests module
```python
import requests

# Log in to Douban through a session so the login cookies persist across requests
session = requests.session()

url = 'https://www.douban.com/accounts/login'
form_data = {
    'source': 'index_nav',
    'form_email': 'xxx',
    'form_password': 'xxx',
    'captcha-solution': 'stamp',
    'captcha-id': 'b3dssX515MsmNaklBX8uh5Ab:en'
}
req_header = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
}

# Post the login form; the session keeps the cookies returned by the server
response = session.post(url, headers=req_header, data=form_data)
if response.status_code == 200:
    # Request a page that requires login and save it locally
    url = 'https://www.douban.com/people/175417123/'
    response = session.get(url, headers=req_header)
    if response.status_code == 200:
        with open('douban3.html', 'w') as file:
            file.write(response.text)
```
```python
import requests
import time

mycookie = {"PHPSESSID": "56v9clgo1kdfo3q5q8ck0aaaaa"}

x = requests.session()
# A cookie added to the session's cookie jar is sent on every request
requests.utils.add_dict_to_cookiejar(x.cookies, {"PHPSESSID": "07et4ol1g7ttb0bnjmbiqjhp43"})
# A cookie passed through the cookies= argument applies to this request only
x.get("http://127.0.0.1:80", cookies=mycookie)
time.sleep(5)
x.get("http://127.0.0.1:80")
```
```python
import json
import requests

# Basic HTTP methods
r = requests.get('https://github.com/timeline.json')
r = requests.post("http://m.ctrip.com/post")
r = requests.put("http://m.ctrip.com/put")
r = requests.delete("http://m.ctrip.com/delete")
r = requests.head("http://m.ctrip.com/head")
r = requests.options("http://m.ctrip.com/get")
print(r.content)
print(r.text)

# URL parameters
payload = {'keyword': '日本', 'salecityid': '2'}
r = requests.get("http://m.ctrip.com/webapp/tourvisa/visa_list", params=payload)
print(r.url)

# Response encoding
r = requests.get('https://github.com/timeline.json')
print(r.encoding)
r.encoding = 'utf-8'

# JSON response
r = requests.get('https://github.com/timeline.json')
print(r.json())

# Custom request headers
url = 'http://m.ctrip.com'
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 4 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19'}
r = requests.post(url, headers=headers)
print(r.request.headers)

# POST a JSON body
url = 'http://m.ctrip.com'
payload = {'some': 'data'}
r = requests.post(url, data=json.dumps(payload))

# File upload
url = 'http://m.ctrip.com'
files = {'file': open('report.xls', 'rb')}
r = requests.post(url, files=files)

# Response status code
r = requests.get('http://m.ctrip.com')
print(r.status_code)

# Response headers
r = requests.get('http://m.ctrip.com')
print(r.headers)
print(r.headers['Content-Type'])
print(r.headers.get('content-type'))

# Read a cookie set by the response
url = 'http://example.com/some/cookie/setting/url'
r = requests.get(url)
r.cookies['example_cookie_name']

# Send cookies with a request
url = 'http://m.ctrip.com/cookies'
cookies = dict(cookies_are='working')
r = requests.get(url, cookies=cookies)

# Timeout (seconds)
r = requests.get('http://m.ctrip.com', timeout=0.001)

# Proxies
proxies = {
    "http": "http://10.10.10.10:8888",
    "https": "http://10.10.10.100:4444",
}
r = requests.get('http://m.ctrip.com', proxies=proxies)
```
Data parsing: regular expressions, xpath, bs4. Scraping dynamically loaded data: selenium (environment setup issues). A parsing sketch follows below.
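A minimal sketch of the three parsing approaches side by side; the HTML snippet and the expressions are illustrative assumptions, not taken from these notes:

```python
import re

from bs4 import BeautifulSoup
from lxml import etree

# Illustrative HTML snippet (assumption, not from the notes)
html = '<div class="item"><a href="/detail/1">title-1</a></div>'

# 1) regular expression
print(re.findall(r'href="(.*?)"', html))            # ['/detail/1']

# 2) xpath via lxml
tree = etree.HTML(html)
print(tree.xpath('//div[@class="item"]/a/text()'))  # ['title-1']

# 3) bs4
soup = BeautifulSoup(html, 'lxml')
print(soup.select_one('.item > a').text)            # title-1
```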
Scraping data from mobile clients
- asynchronous crawlers
- 10 kinds of anti-crawling mechanisms
- the scrapy framework: an asynchronous crawler framework
Asynchronous crawler

```python
from multiprocessing.dummy import Pool  # thread pool

# Apply the callback concurrently to every element of alist
# (callback and alist stand for the download function and the task list)
pool = Pool(4)
pool.map(callback, alist)
```
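As a concrete use of the thread pool, several pages can be downloaded concurrently. A minimal sketch, assuming a hypothetical list of URLs:

```python
import requests
from multiprocessing.dummy import Pool  # thread pool, not true async IO

urls = [  # hypothetical URLs
    'http://127.0.0.1:5000/page1',
    'http://127.0.0.1:5000/page2',
    'http://127.0.0.1:5000/page3',
]

def download(url):
    # Blocking download; the pool runs several of these at the same time
    return requests.get(url).text

pool = Pool(3)
pages = pool.map(download, urls)  # one result per url, in order
pool.close()
pool.join()
```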
Scrapy

```
Create a project:   scrapy startproject xxx
Enter the project:  cd xxx                          # change into the project directory
Create a spider:    scrapy genspider xxx(spider name) xxx.com(domain to crawl)   # the spider name must differ from the project name
Export a file:      scrapy crawl xxx -o xxx.json    # export the scraped data in the given format
Run a spider:       scrapy crawl xxx
List all spiders:   scrapy list
Show settings:      scrapy settings [options]
```
xpath: .get() is equivalent to .extract_first()
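Inside a spider callback, both of the following return the text of the first matched node, or None if nothing matches (the XPath expression is illustrative):

```python
def parse(self, response):
    title = response.xpath('//div[@class="title"]/text()').get()
    title = response.xpath('//div[@class="title"]/text()').extract_first()
```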
Persistent storage (pipelines.py)
- process_item receives the item object submitted by the spider.
- Override the parent-class hooks:
  - def open_spider(self, spider): executed only once, when the spider starts
  - def close_spider(self, spider): executed only once, when the spider closes
- Multiple pipeline classes can be defined; see the sketch below.
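A minimal pipeline sketch along these lines; the file name demo.csv and the field name title are assumptions, not from the notes:

```python
# pipelines.py
class DemoPipeline:
    fp = None

    def open_spider(self, spider):
        # called exactly once, when the spider starts
        self.fp = open('demo.csv', 'w', encoding='utf-8')  # assumed file name

    def process_item(self, item, spider):
        # called once per item submitted by the spider
        self.fp.write(item['title'] + '\n')  # assumed field name
        return item  # pass the item on to the next pipeline class

    def close_spider(self, spider):
        # called exactly once, when the spider closes
        self.fp.close()
```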
Persistent storage
- Terminal-command-based persistent storage
  - Limitation: this approach can only store the return value of the parse method into a local text file with a supported extension.
  - Command: scrapy crawl spiderName -o filePath
- Pipeline-based persistent storage (the important one)
  - Parse the data in the spider file.
  - Define the corresponding attributes in items.py: one attribute for each field parsed in the previous step.
  - In the spider file, package the parsed data into an item object.
  - Submit the item object to the pipeline.
  - In the pipeline file (pipelines.py), receive the item object submitted by the spider and apply any form of persistent storage to it.
  - Enable the pipeline in the settings file.
  (a sketch of these steps follows below)
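A minimal sketch of those steps across the three files involved; the item class name, the field name title, the XPath, and the pipeline path are assumptions:

```python
# items.py -- one attribute per parsed field
import scrapy

class DemoItem(scrapy.Item):
    title = scrapy.Field()  # assumed field name


# in the spider's parse method -- package the data into an item and submit it
def parse(self, response):
    for div in response.xpath('//div[@class="item"]'):  # illustrative XPath
        item = DemoItem()
        item['title'] = div.xpath('./a/text()').get()
        yield item  # submits the item to the pipeline


# settings.py -- enable the pipeline
ITEM_PIPELINES = {
    'demo.pipelines.DemoPipeline': 300,  # lower number = runs earlier
}
```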
Manual requests

```python
# Manually issue a follow-up GET request; meta passes data to the callback
yield scrapy.Request(url=url, callback=self.xxx, meta={'666': '666'})
```

FormRequest is used for POST requests.
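The meta dictionary is how a partially filled item can be handed from one callback to the next when an item's fields live on different pages. A minimal sketch; the URLs, XPath expressions, and field names are assumptions:

```python
import scrapy

class DemoSpider(scrapy.Spider):
    name = 'demo'
    start_urls = ['http://www.example.com/list']  # assumed URL

    def parse(self, response):
        for div in response.xpath('//div[@class="item"]'):
            item = {'title': div.xpath('./a/text()').get()}
            detail_url = response.urljoin(div.xpath('./a/@href').get())
            # manual request: hand the item to the next callback through meta
            yield scrapy.Request(url=detail_url, callback=self.parse_detail,
                                 meta={'item': item})

    def parse_detail(self, response):
        item = response.meta['item']  # take the item back out of meta
        item['content'] = response.xpath('//p/text()').get()
        yield item
```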
Downloader middleware
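The notes only name the topic; a minimal downloader-middleware sketch, assuming hypothetical User-Agent and proxy pools (the class would be enabled under DOWNLOADER_MIDDLEWARES in settings.py):

```python
# middlewares.py
import random

class DemoDownloaderMiddleware:
    # hypothetical pools; a real project would use larger, current lists
    user_agent_list = ['Mozilla/5.0 (X11; Linux x86_64) ...', 'Mozilla/5.0 (Windows NT 10.0) ...']
    proxy_http = ['http://10.10.10.10:8888']

    def process_request(self, request, spider):
        # intercept every outgoing request: randomize the UA, optionally set a proxy
        request.headers['User-Agent'] = random.choice(self.user_agent_list)
        request.meta['proxy'] = random.choice(self.proxy_http)
        return None  # continue processing the request normally

    def process_response(self, request, response, spider):
        # intercept every response; returned unchanged here
        return response
```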
File downloads: a pipeline class (xxxPipeline) in the pipelines file handles the file download.
Large file downloads
- The pipeline class below is already packaged by Scrapy; we can use it directly: from scrapy.pipelines.images import ImagesPipeline
- Override three methods of this pipeline class:
  - get_media_requests: issue requests to the image URLs
  - file_path: return just the image's file name
  - item_completed: return the item, handing it to the next pipeline class to be executed
- Add IMAGES_STORE = 'dirName' to the settings file
  (a sketch of the subclass follows below)
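A minimal sketch of that subclass; the field name img_src and the storage directory are assumptions:

```python
# pipelines.py
import scrapy
from scrapy.pipelines.images import ImagesPipeline

class DemoImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # issue a request for the image URL stored on the item
        yield scrapy.Request(item['img_src'])  # assumed field name

    def file_path(self, request, response=None, info=None, *, item=None):
        # return only the file name; IMAGES_STORE supplies the directory
        return request.url.split('/')[-1]

    def item_completed(self, results, item, info):
        # hand the item to the next pipeline class to be executed
        return item


# settings.py
# IMAGES_STORE = './imgs'  # assumed directory name
```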
Simulated login

```python
import scrapy


class PosttestSpider(scrapy.Spider):
    name = 'posttest'
    start_urls = ['http://vip.xxhoz.com/ucenter/login.php']
    cookies = {
        'Cookie': 'UM_distinctid=171a6c4c9ca280-0c0b194dcd55b9-87f133f-144000-171a6c4c9cb487; PHPSESSID=vgmt904t67ha9mv96oph2q45tk; CNZZDATA1278827165=2035019748-1587636301-null%7C1602920955'
    }

    def start_requests(self):
        # send the initial request with the login-page cookies attached
        start_url = 'http://vip.xxhoz.com/ucenter/login.php'
        yield scrapy.Request(start_url, callback=self.parse, cookies=self.cookies)

    # Option 1: let Scrapy pre-fill the form from the login page's <form>
    def parse(self, response):
        yield scrapy.FormRequest.from_response(
            response,
            formdata={
                'u_name': 'admin',
                'u_password': 'admin'
            },
            callback=self.f
        )

    def f(self, response):
        print(response.text)

    # Option 2: build the POST request manually. This is an alternative to the
    # parse above; in Python the second definition overrides the first, so keep
    # only one of the two in a real spider.
    def parse(self, response):
        yield scrapy.FormRequest(
            url='http://vip.xxhoz.com/ucenter/login.php',
            formdata={
                'u_name': 'admin',
                'u_password': 'admin'
            },
            callback=self.f
        )

    def f(self, response):
        print(response.text)
```