Web Crawlers
bs4 reference: https://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html
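The examples below lean on two BeautifulSoup calls: select() with a CSS selector and find_all() with an href filter. A minimal sketch of both (the HTML snippet here is made up for illustration):

import re
from bs4 import BeautifulSoup

html = '<div class="t"><a href="http://example.com/article/1">title</a></div>'
soup = BeautifulSoup(html, "html.parser")

# CSS selector: <a> tags that are direct children of elements with class "t"
for a in soup.select(".t > a"):
    print(a["href"], a.string)

# attribute filter: <a> tags whose href matches a regular expression
for a in soup.find_all("a", href=re.compile(r"/article/")):
    print(a["href"])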
Example 1: a URL collector built on Baidu's search interface
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'}

for i in range(0, 100, 10):  # Baidu's pn parameter is a result offset: 0, 10, 20, ...
    bd_search = "https://www.baidu.com/s?wd=inurl:/dede/login.php?&pn=%s" % str(i)
    r = requests.get(bd_search, headers=headers)
    soup = BeautifulSoup(r.text, "lxml")
    url_list = soup.select(".t > a")  # <a> tags inside the class-"t" result titles
    for url in url_list:
        real_url = url["href"]        # Baidu redirect link
        r = requests.get(real_url)    # follow the redirect
        print(r.url)                  # the real target URL
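Baidu wraps each result in a redirect link, which is why the loop requests real_url a second time and prints r.url. A hedged refinement (the function name is illustrative): add a timeout and catch request errors so one dead redirect does not abort the whole run:

import requests

def resolve(baidu_link, headers, timeout=10):
    # follow Baidu's redirect and return the final URL, or None on failure
    try:
        r = requests.get(baidu_link, headers=headers, timeout=timeout)
        return r.url
    except requests.RequestException:
        return None

# inside the loop above:
#     real = resolve(url["href"], headers)
#     if real:
#         print(real)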
Example 2: scraping article titles
import requests
import re

def get_article():
    for x in range(1, 59):  # page count is hardcoded here; example 3 detects it dynamically
        url = "http://zone.secevery.com/sort_type-new__day-0__is_recommend-0__page-"
        url = url + str(x)
        resp = requests.get(url)
        # match against raw bytes (resp.content), hence the br'' pattern and the decode below
        result = re.findall(br'<a href="http://zone.secevery.com/article/\d+">(.*?)</a>', resp.content)
        for article in result:
            print(article.decode('utf-8'))

if __name__ == '__main__':
    get_article()
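The same extraction can run on decoded text instead of bytes, which drops the per-match decode. A sketch, assuming the page is UTF-8 (made explicit via requests' apparent_encoding):

import re
import requests

resp = requests.get("http://zone.secevery.com/sort_type-new__day-0__is_recommend-0__page-1")
resp.encoding = resp.apparent_encoding  # be explicit about the charset
titles = re.findall(r'<a href="http://zone.secevery.com/article/\d+">(.*?)</a>', resp.text)
for title in titles:
    print(title)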
Example 3: scraping article titles and writing them to a file
# -*- coding: utf-8 -*-
import requests
import re
from bs4 import BeautifulSoup

headers = {
    'Host': "zone.secevery.com",
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    'Referer': 'https://www.baidu.com'
}

def get_total_pages():
    # read the page count from the last pagination link on the front page
    url = 'http://zone.secevery.com/'
    response = requests.get(url=url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    a = soup.find_all('a', href=re.compile('is_recommend-0__page-'))
    a = str(a[-1])
    total_pages = a.split('-')[-1].split('"')[0]
    return total_pages

def get_total_titles(pages):
    titles = []
    total_pages = int(pages)
    for page in range(1, total_pages + 1):  # +1 so the last page is included
        url = 'http://zone.secevery.com/sort_type-new__day-0__is_recommend-0__page-%d' % page
        resp = requests.get(url=url, headers=headers)
        soup = BeautifulSoup(resp.content, 'html.parser')
        a = soup.find_all('a', href=re.compile(r'com/article'))
        for i in a:
            if i.string != "查看全部":  # skip the site's "view all" link
                titles.append(i.string)
    return titles

def write_txt(titles):
    with open('title1.txt', 'w', encoding='utf-8') as f:
        for title in titles:
            if title:  # i.string is None for tags with nested children
                f.write(title + '\n')

if __name__ == '__main__':
    print('-' * 60)
    print("Starting......")
    print('-' * 60)
    pages = get_total_pages()
    print("Total pages: %s" % pages)
    print('-' * 60)
    print("Fetching data, please wait....")
    titles = get_total_titles(pages)
    print('Writing file, please wait...')
    print('-' * 60)
    write_txt(titles)
    print("well done!")
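One fragile spot: get_total_pages() extracts the page count by string-splitting the stringified tag. A hedged alternative (parse_total_pages is an illustrative name) pulls the trailing number straight from the href of the last pagination link:

import re

def parse_total_pages(last_href):
    # e.g. last_href = 'sort_type-new__day-0__is_recommend-0__page-58'
    m = re.search(r'page-(\d+)', last_href)
    return int(m.group(1)) if m else 1

This would take a[-1]['href'] as input instead of str(a[-1]).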
Example 4: scraping target links from Butian's public-welfare SRC
import requests
from requests.packages import urllib3
from bs4 import BeautifulSoup

# project hall:           https://butian.360.cn/Reward/plan
# vendor submission page: https://butian.360.cn/Loo/submit?cid=61101

headers = {
    "Host": "butian.360.cn",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0",
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
    "Referer": "https://butian.360.cn/Reward/plan",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "X-Requested-With": "XMLHttpRequest",
    # session-bound: replace with a fresh Cookie from your own logged-in browser
    "Cookie": "test_cookie_enable=null; __huid=108OHneOmn%2FGBQ%2BJukK2rK3j4HhKMHdLMfd%2BokoEYRQKE%3D; __guid=91251416.1657029331559520500.1512282503171.2822; __DC_gid=90162694.998847747.1523438745794.1537231453981.287; smidV2=201804121919226e10578a54cc4cce100543952a4e7b000018db7a8e1407320; Q=u%3D360H2620192781%26n%3D%26le%3D%26m%3DZGHlWGWOWGWOWGWOWGWOWGWOZwHm%26qid%3D2620192781%26im%3D1_t0105d6cf9b508f72c8%26src%3Dpcw_webscan%26t%3D1; T=s%3D8b8f0bc6b26203525eb9f114ee626b0e%26t%3D1536202563%26lm%3D%26lf%3D2%26sk%3Daa11fbae76d233eeabdcbbb8abdf885b%26mt%3D1536202563%26rc%3D%26v%3D2.0%26a%3D1; __gid=67796994.160790607.1526525535847.1528973038214.14; UM_distinctid=1636c03266b578-07963af4955a558-4c312a7a-144000-1636c03266c426; __DC_monitor_count=20; PHPSESSID=ingp9jlm0d9v5tosb2do8bs8u7; __q__=1537231281479; test_cookie_enable=null; __DC_sid=138613664.439132718304839230.1537229417384.1438; wzwsconfirm=732968041ce27324ddd8dcf54d958661; wzwstemplate=Mw==; wzwschallenge=-1; ccpassport=5a26b60aa93bd3303aea3061a756590d; wzwsvtime=1537230292",
}

data = {
    "s": "1",
    "p": "1",
    "token": "",
    "sort": "1"
}

urllib3.disable_warnings()  # suppress the InsecureRequestWarning triggered by verify=False

# total number of vendor-list pages
def get_total_pages():
    url = "https://butian.360.cn/Reward/pub"
    resp = requests.post(url=url, headers=headers, data=data, verify=False)
    company_info = resp.json()
    total_pages = company_info['data']['count']
    return total_pages

# collect every vendor's company_id from the paginated list
def get_all_company_id(total_pages):
    company_id = []
    for x in range(1, int(total_pages) + 1):
        print("Fetching page %d of the vendor list" % x)
        data['p'] = x
        url = "https://butian.360.cn/Reward/pub"
        resp = requests.post(url=url, headers=headers, data=data, verify=False)
        info = resp.json()
        company_info = info['data']['list']
        for company in company_info:
            company_id.append(company['company_id'])
    return company_id

# read each vendor's domain from its submission form
def get_all_company_host(company_id):
    hosts = []
    print("Fetching domains, please wait")
    for x in company_id:
        url = "https://butian.360.cn/Loo/submit?cid=%d" % int(x)
        resp = requests.get(url=url, headers=headers, verify=False)
        soup = BeautifulSoup(resp.content, "html.parser")
        inp = soup.find_all('input')
        print(inp[4]['value'])  # the fifth <input> on the form holds the domain
        hosts.append(inp[4]['value'])
    return hosts

# write the collected domains to a txt file
def write(hosts):
    with open('butian_spider.text', 'w') as f:
        for url in hosts:
            f.write(url + '\n')

if __name__ == "__main__":
    total_pages = get_total_pages()
    company_id = get_all_company_id(total_pages)
    hosts = get_all_company_host(company_id)
    print("Writing file")
    write(hosts)
    print("Write complete")
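The Cookie value and the empty token above are session-bound and expire, so paste fresh values from a logged-in browser session before running. As a small convenience, a sketch that keeps the shared headers and verify=False on a requests.Session (assuming the headers dict defined in the script above):

import requests
from requests.packages import urllib3

urllib3.disable_warnings()

session = requests.Session()
session.headers.update(headers)  # the headers dict from the script above
session.verify = False           # replaces verify=False on every individual call

resp = session.post("https://butian.360.cn/Reward/pub",
                    data={"s": "1", "p": "1", "token": "", "sort": "1"})
print(resp.json()["data"]["count"])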