Python3 Learning Notes (Web Scraping Examples)

Web Scraping
  • bs4 reference docs: https://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html
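
    A quick sketch of the bs4 calls the examples below rely on. The sample HTML string here is made up purely for illustration, and "html.parser" can be swapped for "lxml" if lxml is installed:

    from bs4 import BeautifulSoup

    html='<div class="t"><a href="http://example.com/article/1">demo title</a></div>'    # made-up sample HTML
    soup=BeautifulSoup(html,"html.parser")

    for a in soup.select(".t > a"):          # CSS selector: <a> directly under an element with class "t"
        print (a["href"],a.string)           # attribute access and the tag's text

    for a in soup.find_all('a',href=True):   # find_all with an attribute filter
        print (a["href"])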

  • Example 1: Build a URL collector on top of Baidu's search interface

    import requests
    from bs4 import BeautifulSoup
    
    headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'}
    
    for pn in range(0,100,10):    # Baidu pages by the pn offset, in steps of 10
        bd_search="https://www.baidu.com/s?wd=inurl:/dede/login.php?&pn=%d" % pn
        r=requests.get(bd_search,headers=headers)
        soup=BeautifulSoup(r.text,"lxml")
        url_list=soup.select(".t > a")    # <a> links directly under each result title (class "t")
        for url in url_list:
            real_url=url["href"]    # a Baidu redirect link
            r=requests.get(real_url)    # follow the redirect; r.url is the real target URL
            print (r.url)
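
    Following every hit with a full GET just to read r.url downloads each target page in full. As an optional variant (only a sketch, not part of the original script), a HEAD request can resolve the Baidu redirect without fetching the body; it is a drop-in replacement for the inner loop above, though some targets do not answer HEAD requests:

    for url in url_list:
        try:
            resp=requests.head(url["href"],headers=headers,allow_redirects=True,timeout=10)
            print (resp.url)    # final URL after the redirect chain
        except requests.RequestException as e:
            print ("skip:",e)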
    
  • Example 2: Scrape article titles

    import requests
    import re
    
    def get_article():
        for x in range(1,59):
            url="http://zone.secevery.com/sort_ytpe-new__day-0__is_recommend-0_page-"
            url=url+str(x)
            resp=requests.get(url)
            result=re.findall(br'<a href="http://zone.secevery.com/article/\d+">(.*?)</a>',resp.content)
            for article in result:
                print (article.decode('utf-8'))
    
    if __name__=='__main__':
        get_article()
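
    The regex above matches against resp.content (bytes), which is why the pattern is a bytes literal and each title needs .decode('utf-8'). An equivalent variant (just a sketch of the same idea) matches against resp.text and lets requests handle the decoding:

    import requests
    import re

    def get_article_text():
        base="http://zone.secevery.com/sort_type-new__day-0__is_recommend-0__page-"
        for x in range(1,59):
            resp=requests.get(base+str(x))
            for title in re.findall(r'<a href="http://zone.secevery.com/article/\d+">(.*?)</a>',resp.text):
                print (title)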
    
  • Example 3: Scrape article titles and write them to a file

    # -*- coding: utf-8 -*-
    import requests
    import re
    from bs4 import BeautifulSoup
    
    headers={
        'Host': "zone.secevery.com",
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        'Referer':'https://www.baidu.com'
    }
    
    def get_total_pages():
        # read the pagination links on the front page to find the total number of pages
        url='http://zone.secevery.com/'
        response=requests.get(url=url,headers=headers)
        soup=BeautifulSoup(response.content,'html.parser')

        a=soup.find_all('a',href=re.compile('is_recommend-0__page-'))
        a=str(a[-1])    # the last pagination link points at the final page
        total_pages=a.split('-')[-1].split('"')[0]    # the number after the final '-' in its href
        return total_pages
    
    def get_total_titles(pages):
        titles=[]
        total_pages=int(pages)
        for page in range(1,total_pages+1):    # +1 so the final page is included
            url='http://zone.secevery.com/sort_type-new__day-0__is_recommend-0__page-%d'%page
            resp=requests.get(url=url,headers=headers)
            soup=BeautifulSoup(resp.content,'html.parser')
            a = soup.find_all('a',href=re.compile(r'com/article'))
            for i in a:
                if i.string != "查看全部":    # skip the "查看全部" (view all) link, keep real article titles
                    titles.append(i.string)
        return titles
    
    def write_txt(titles):
        with open('title1.txt','w',encoding='utf-8') as f:    # titles are Chinese, so write UTF-8 explicitly
            for title in titles:
                f.write(title+'\n')
    
    if __name__=='__main__':
        print('-'*60)
        print("程序启动中......")
        print('-'*60)
        pages=get_total_pages()
        print("总页数为:%s" % pages)
        print('-'*60)
        print("正在获取数据,请稍后....")
        titles=get_total_titles(pages)
        print('文件正在写入,请稍后...')
        print('-'*60)
        write_txt(titles)
        print("well done!")
    
  • Example 4: Scrape target URLs from the Butian public-welfare SRC

    import requests
    from requests.packages import urllib3
    from bs4 import BeautifulSoup
    # project hall (vendor listing) URL
    # https://butian.360.cn/Reward/plan
    
    # vulnerability submission page for a single vendor
    # https://butian.360.cn/Loo/submit?cid=61101
    
    headers={
        "Host": "butian.360.cn",
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0",
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
        "Referer": "https://butian.360.cn/Reward/plan",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "X-Requested-With": "XMLHttpRequest",
        "Content-Length": "14",
        "Cookie": "test_cookie_enable=null; __huid=108OHneOmn%2FGBQ%2BJukK2rK3j4HhKMHdLMfd%2BokoEYRQKE%3D; __guid=91251416.1657029331559520500.1512282503171.2822; __DC_gid=90162694.998847747.1523438745794.1537231453981.287; smidV2=201804121919226e10578a54cc4cce100543952a4e7b000018db7a8e1407320; Q=u%3D360H2620192781%26n%3D%26le%3D%26m%3DZGHlWGWOWGWOWGWOWGWOWGWOZwHm%26qid%3D2620192781%26im%3D1_t0105d6cf9b508f72c8%26src%3Dpcw_webscan%26t%3D1; T=s%3D8b8f0bc6b26203525eb9f114ee626b0e%26t%3D1536202563%26lm%3D%26lf%3D2%26sk%3Daa11fbae76d233eeabdcbbb8abdf885b%26mt%3D1536202563%26rc%3D%26v%3D2.0%26a%3D1; __gid=67796994.160790607.1526525535847.1528973038214.14; UM_distinctid=1636c03266b578-07963af4955a558-4c312a7a-144000-1636c03266c426; __DC_monitor_count=20; PHPSESSID=ingp9jlm0d9v5tosb2do8bs8u7; __q__=1537231281479; test_cookie_enable=null; __DC_sid=138613664.439132718304839230.1537229417384.1438; wzwsconfirm=732968041ce27324ddd8dcf54d958661; wzwstemplate=Mw==; wzwschallenge=-1; ccpassport=5a26b60aa93bd3303aea3061a756590d; wzwsvtime=1537230292",
    }
    
    data={
        "s":"1",
        "p":"1",
        "token":"",
        "sort":"1"
    }
    
    urllib3.disable_warnings()    # suppress the InsecureRequestWarning caused by verify=False
    
    # get the total number of vendor listing pages
    def get_total_pages():
        url="https://butian.360.cn/Reward/pub"
        resp=requests.post(url=url,headers=headers,data=data,verify=False)
        company_info=resp.json()
        total_pages=company_info['data']['count']
        return total_pages
    
    # collect the company_id of every vendor in the listing
    def get_all_company_id(total_pages):
        company_id=[]
        for x in range(1,int(total_pages)+1):
            print ("正在获取第%d页列表" %x)
            data['p']=x
            url="https://butian.360.cn/Reward/pub"
            resp=requests.post(url=url,headers=headers,data=data,verify=False)
            info=resp.json()
            company_info=info['data']['list']
            for company in company_info:
                company_id.append(company['company_id'])
        return company_id
    
    # get each vendor's domain from its submission page
    def get_all_company_host(company_id):
        hosts=[]
        print ("正在获取域名,请稍后")
        for x in company_id:
            url="https://butian.360.cn/Loo/submit?cid=%d" % int(x)
            resp=requests.get(url=url,headers=headers,data=data,verify=False)
            soup=BeautifulSoup(resp.content,"html.parser")
            inp=soup.find_all('input')
            print (inp[4]['value'])
            hosts.append(inp[4]['value'])
        return hosts
    
    # write the collected domains to a txt file
    def write(hosts):
        with open('butian_spider.txt','w') as f:
            for url in hosts:
                f.write(url+'\n')
    
    if __name__=="__main__":
        total_pages=get_total_pages()
        company_id=get_all_company_id(total_pages)
        hosts=get_all_company_host(company_id)
        print ("正在写入文件")
        write(hosts)
        print ("成功写入")