Web Crawlers
bs4 reference: https://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html
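The examples below lean on two BeautifulSoup calls: select() with a CSS selector and find_all() with an href filter. A minimal sketch of both (the HTML snippet here is made up for illustration):

import re
from bs4 import BeautifulSoup

html = '<div class="t"><a href="http://example.com/article/1">title</a></div>'
soup = BeautifulSoup(html, "html.parser")

# CSS selector: <a> tags that are direct children of elements with class "t"
for a in soup.select(".t > a"):
    print(a["href"], a.string)

# attribute filter: <a> tags whose href matches a regular expression
for a in soup.find_all("a", href=re.compile(r"/article/")):
    print(a["href"])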
Example 1: a URL collector built on Baidu's search interface
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'}

for i in range(0, 100, 10):  # Baidu's pn parameter is a result offset: 0, 10, 20, ...
    bd_search = "https://www.baidu.com/s?wd=inurl:/dede/login.php?&pn=%s" % str(i)
    r = requests.get(bd_search, headers=headers)
    soup = BeautifulSoup(r.text, "lxml")
    url_list = soup.select(".t > a")  # <a> tags inside the class-"t" result titles
    for url in url_list:
        real_url = url["href"]        # Baidu redirect link
        r = requests.get(real_url)    # follow the redirect
        print(r.url)                  # the real target URL
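Baidu wraps each result in a redirect link, which is why the loop requests real_url a second time and prints r.url. A hedged refinement (the function name is illustrative): add a timeout and catch request errors so one dead redirect does not abort the whole run:

import requests

def resolve(baidu_link, headers, timeout=10):
    # follow Baidu's redirect and return the final URL, or None on failure
    try:
        r = requests.get(baidu_link, headers=headers, timeout=timeout)
        return r.url
    except requests.RequestException:
        return None

# inside the loop above:
#     real = resolve(url["href"], headers)
#     if real:
#         print(real)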
Example 2: scraping article titles
import requests
import re

def get_article():
    for x in range(1, 59):  # page count is hardcoded here; example 3 detects it dynamically
        url = "http://zone.secevery.com/sort_type-new__day-0__is_recommend-0__page-"
        url = url + str(x)
        resp = requests.get(url)
        # match against raw bytes (resp.content), hence the br'' pattern and the decode below
        result = re.findall(br'<a href="http://zone.secevery.com/article/\d+">(.*?)</a>', resp.content)
        for article in result:
            print(article.decode('utf-8'))

if __name__ == '__main__':
    get_article()
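The same extraction can run on decoded text instead of bytes, which drops the per-match decode. A sketch, assuming the page is UTF-8 (made explicit via requests' apparent_encoding):

import re
import requests

resp = requests.get("http://zone.secevery.com/sort_type-new__day-0__is_recommend-0__page-1")
resp.encoding = resp.apparent_encoding  # be explicit about the charset
titles = re.findall(r'<a href="http://zone.secevery.com/article/\d+">(.*?)</a>', resp.text)
for title in titles:
    print(title)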
Example 3: scraping article titles and writing them to a file
# -*- coding: utf-8 -*-
import requests
import re
from bs4 import BeautifulSoup

headers = {
    'Host': "zone.secevery.com",
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    'Referer': 'https://www.baidu.com'
}

def get_total_pages():
    # read the page count from the last pagination link on the front page
    url = 'http://zone.secevery.com/'
    response = requests.get(url=url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    a = soup.find_all('a', href=re.compile('is_recommend-0__page-'))
    a = str(a[-1])
    total_pages = a.split('-')[-1].split('"')[0]
    return total_pages

def get_total_titles(pages):
    titles = []
    total_pages = int(pages)
    for page in range(1, total_pages + 1):  # +1 so the last page is included
        url = 'http://zone.secevery.com/sort_type-new__day-0__is_recommend-0__page-%d' % page
        resp = requests.get(url=url, headers=headers)
        soup = BeautifulSoup(resp.content, 'html.parser')
        a = soup.find_all('a', href=re.compile(r'com/article'))
        for i in a:
            if i.string != "查看全部":  # skip the site's "view all" link
                titles.append(i.string)
    return titles

def write_txt(titles):
    with open('title1.txt', 'w', encoding='utf-8') as f:
        for title in titles:
            if title:  # i.string is None for tags with nested children
                f.write(title + '\n')

if __name__ == '__main__':
    print('-' * 60)
    print("Starting......")
    print('-' * 60)
    pages = get_total_pages()
    print("Total pages: %s" % pages)
    print('-' * 60)
    print("Fetching data, please wait....")
    titles = get_total_titles(pages)
    print('Writing file, please wait...')
    print('-' * 60)
    write_txt(titles)
    print("well done!")
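One fragile spot: get_total_pages() extracts the page count by string-splitting the stringified tag. A hedged alternative (parse_total_pages is an illustrative name) pulls the trailing number straight from the href of the last pagination link:

import re

def parse_total_pages(last_href):
    # e.g. last_href = 'sort_type-new__day-0__is_recommend-0__page-58'
    m = re.search(r'page-(\d+)', last_href)
    return int(m.group(1)) if m else 1

This would take a[-1]['href'] as input instead of str(a[-1]).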
Example 4: scraping target links from Butian's public-welfare SRC
import requests
from requests.packages import urllib3
from bs4 import BeautifulSoup

# project hall:           https://butian.360.cn/Reward/plan
# vendor submission page: https://butian.360.cn/Loo/submit?cid=61101

headers = {
    "Host": "butian.360.cn",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0",
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
    "Referer": "https://butian.360.cn/Reward/plan",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "X-Requested-With": "XMLHttpRequest",
    # session-bound: replace with a fresh Cookie from your own logged-in browser
    "Cookie": "test_cookie_enable=null; __huid=108OHneOmn%2FGBQ%2BJukK2rK3j4HhKMHdLMfd%2BokoEYRQKE%3D; __guid=91251416.1657029331559520500.1512282503171.2822; __DC_gid=90162694.998847747.1523438745794.1537231453981.287; smidV2=201804121919226e10578a54cc4cce100543952a4e7b000018db7a8e1407320; Q=u%3D360H2620192781%26n%3D%26le%3D%26m%3DZGHlWGWOWGWOWGWOWGWOWGWOZwHm%26qid%3D2620192781%26im%3D1_t0105d6cf9b508f72c8%26src%3Dpcw_webscan%26t%3D1; T=s%3D8b8f0bc6b26203525eb9f114ee626b0e%26t%3D1536202563%26lm%3D%26lf%3D2%26sk%3Daa11fbae76d233eeabdcbbb8abdf885b%26mt%3D1536202563%26rc%3D%26v%3D2.0%26a%3D1; __gid=67796994.160790607.1526525535847.1528973038214.14; UM_distinctid=1636c03266b578-07963af4955a558-4c312a7a-144000-1636c03266c426; __DC_monitor_count=20; PHPSESSID=ingp9jlm0d9v5tosb2do8bs8u7; __q__=1537231281479; test_cookie_enable=null; __DC_sid=138613664.439132718304839230.1537229417384.1438; wzwsconfirm=732968041ce27324ddd8dcf54d958661; wzwstemplate=Mw==; wzwschallenge=-1; ccpassport=5a26b60aa93bd3303aea3061a756590d; wzwsvtime=1537230292",
}

data = {
    "s": "1",
    "p": "1",
    "token": "",
    "sort": "1"
}

urllib3.disable_warnings()  # suppress the InsecureRequestWarning triggered by verify=False

# total number of vendor-list pages
def get_total_pages():
    url = "https://butian.360.cn/Reward/pub"
    resp = requests.post(url=url, headers=headers, data=data, verify=False)
    company_info = resp.json()
    total_pages = company_info['data']['count']
    return total_pages

# collect every vendor's company_id from the paginated list
def get_all_company_id(total_pages):
    company_id = []
    for x in range(1, int(total_pages) + 1):
        print("Fetching page %d of the vendor list" % x)
        data['p'] = x
        url = "https://butian.360.cn/Reward/pub"
        resp = requests.post(url=url, headers=headers, data=data, verify=False)
        info = resp.json()
        company_info = info['data']['list']
        for company in company_info:
            company_id.append(company['company_id'])
    return company_id

# read each vendor's domain from its submission form
def get_all_company_host(company_id):
    hosts = []
    print("Fetching domains, please wait")
    for x in company_id:
        url = "https://butian.360.cn/Loo/submit?cid=%d" % int(x)
        resp = requests.get(url=url, headers=headers, verify=False)
        soup = BeautifulSoup(resp.content, "html.parser")
        inp = soup.find_all('input')
        print(inp[4]['value'])  # the fifth <input> on the form holds the domain
        hosts.append(inp[4]['value'])
    return hosts

# write the collected domains to a txt file
def write(hosts):
    with open('butian_spider.text', 'w') as f:
        for url in hosts:
            f.write(url + '\n')

if __name__ == "__main__":
    total_pages = get_total_pages()
    company_id = get_all_company_id(total_pages)
    hosts = get_all_company_host(company_id)
    print("Writing file")
    write(hosts)
    print("Write complete")
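The Cookie value and the empty token above are session-bound and expire, so paste fresh values from a logged-in browser session before running. As a small convenience, a sketch that keeps the shared headers and verify=False on a requests.Session (assuming the headers dict defined in the script above):

import requests
from requests.packages import urllib3

urllib3.disable_warnings()

session = requests.Session()
session.headers.update(headers)  # the headers dict from the script above
session.verify = False           # replaces verify=False on every individual call

resp = session.post("https://butian.360.cn/Reward/pub",
                    data={"s": "1", "p": "1", "token": "", "sort": "1"})
print(resp.json()["data"]["count"])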