
Fetching Zhihu hot news, and crawling Tencent News tech articles



Recursively crawling a site's links and extracting the images

Python crawler for Tencent News tech articles: the 企鹅智酷 (Penguin Intelligence) series (1)

Not much preamble; here is the code. It is written mainly with BeautifulSoup.

 

<code>
# -*- coding: utf-8 -*-
"""
Created on Mon May 18 19:12:06 2015

@author: Administrator
"""
# Python 2.x: fetch a Tencent News list page, write every article's
# title/summary/link to AllTitle.txt, and save each article body to its own .txt file.
import urllib
import os
from bs4 import BeautifulSoup
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

i = 0
j = 0
list_a = []

def gettext(href):
    # download one article page and append its paragraphs to "<numbered title>.txt"
    global j, list_a
    page = urllib.urlopen(href).read()
    soup = BeautifulSoup(page, from_encoding="gb18030")
    div = soup.find_all("div", class_="content")
    p_text = div[0].find_all("p")
    fp = open("%s.txt" % list_a[j], "a")
    for p in p_text:
        fp.write(' ')
        fp.write(p.get_text())
        fp.write("\n")
    fp.close()
    j += 1

def gethref(url):  # collect all article links from the list page
    global i, list_a
    fp = open("AllTitle.txt", "w+")
    page = urllib.urlopen(url).read()
    soup = BeautifulSoup(page, from_encoding="gb18030")
    ul = soup.find_all("ul", class_="row1")
    li = ul[0].find_all("li")
    for lia in li:
        list_a.append(("%s、" % (i + 1)) + lia.h3.get_text())
        href = lia.a.get('href')
        # write the numbered title, summary and link to the index file
        fp.write("%s、" % (i + 1))
        i += 1
        fp.write("标题:")
        fp.write(lia.h3.get_text())
        fp.write("\n简介:")
        fp.write(lia.p.get_text())
        fp.write("\n链接:")
        fp.write(lia.a.get("href"))
        fp.write("\n")
        gettext(href)
    fp.close()

if "__main__" == __name__:
    url = ""
    gethref(url)
    print "All Is OK!"
</code>


This one runs on Python 2.7. After you enter your account and password at the prompts, it fetches the title links of Zhihu's hot/recommended news. If you want other Zhihu feeds, modify it yourself.

 

Straight to the code:

<code>
# Python 2.x
import re
import requests
import cookielib
from PIL import Image
import time
import json
import webbrowser
from lxml import etree
import urllib2
import urlparse

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36'}
filename = 'cookie'
session = requests.Session()
session.cookies = cookielib.LWPCookieJar(filename)
try:
    session.cookies.load(filename=filename, ignore_discard=True)
except:
    print('cookie fail')

# <input type="hidden" name="_xsrf" value="f1f90f1cfe8ec5c732ef0d8833ccabe8"/>
def get_xsrf():
    # pull the anti-CSRF token out of the home page
    response = session.get('https://www.zhihu.com', headers=headers)
    html = response.text
    get_xsrf_pattern = re.compile(r'<input type="hidden" name="_xsrf" value="(.*?)"')
    _xsrf = re.findall(get_xsrf_pattern, html)[0]
    return _xsrf

def get_captcha():
    # download the captcha image, show it, and ask the user to type it in
    t = str(int(time.time() * 1000))
    captcha_url = 'https://www.zhihu.com/captcha.gif?r=' + t + '&type=login'
    response = session.get(captcha_url, headers=headers)
    with open('cptcha.gif', 'wb') as f:
        f.write(response.content)
    im = Image.open('cptcha.gif')
    im.show()
    captcha = raw_input('Verification code:')
    print captcha
    return captcha

def login(username, password):
    # an 11-digit account is treated as a phone number, otherwise as an email
    if re.match(r'\d{11}$', username):
        print('phone logining')
        url = 'http://www.zhihu.com/login/phone_num'
        data = {'_xsrf': get_xsrf(),
                'password': password,
                'remember_me': 'true',
                'phone_num': username
                }
    else:
        print('email logining')
        url = 'https://www.zhihu.com/login/email'
        data = {'_xsrf': get_xsrf(),
                'password': password,
                'remember_me': 'true',
                'email': username
                }
    data['captcha'] = get_captcha()
    result = session.post(url, data=data, headers=headers)
    print((json.loads(result.text))['msg'] + ' codeLogin')
    # session.cookies.save(ignore_discard=True, ignore_expires=True)

def nextMore(offset, start):
    # request the next batch of feed entries
    url = 'https://www.zhihu.com/node/TopStory2FeedList'
    data = {'params': {'offset': offset, 'start': start},
            'method': 'next'
            }
    result = session.post(url, data=data, headers=headers)
    print((json.loads(result.text))['msg'] + ' ')

def download(url, headers, proxy, num_retries, data=None):
    # plain urllib2 download with optional proxy and retries on 5xx errors
    headers = headers or {}
    print 'Downloading:', url
    request = urllib2.Request(url, data, headers)
    opener = urllib2.build_opener()
    if proxy:
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:
        response = opener.open(request)
        html = response.read()
        code = response.code
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = ''
        if hasattr(e, 'code'):
            code = e.code
            if num_retries > 0 and 500 <= code < 600:
                # retry 5XX HTTP errors
                return download(url, headers, proxy, num_retries - 1, data)
        else:
            code = None
    return html

if __name__ == '__main__':
    account = raw_input('account:')
    secret = raw_input('password:')
    login(account, secret)
    get_url = 'https://www.zhihu.com/explore/recommendations'
    resp = session.get(get_url, headers=headers, allow_redirects=False)
    page = etree.HTML(resp.text)
    i = 1
    while i < 6:
        string = "//div[@id='zh-recommend']/div[2]/div[1]/div[" + str(i) + "]/h2/a"
        hrefs = page.xpath(string)
        for href in hrefs:
            print href.text + '\n' + 'https://www.zhihu.com' + str(href.attrib['href'])
            url = 'https://www.zhihu.com' + str(href.attrib['href'])
        i = i + 1
    webbrowser.open(get_url, new=0, autoraise=True)
</code>
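
The loop at the end of the script walks XPath expressions of the form //div[@id='zh-recommend']/div[2]/div[1]/div[i]/h2/a over the recommendations page. Below is a self-contained lxml sketch of that extraction against invented markup shaped the way the XPath expects; Zhihu's live markup has almost certainly changed, so treat the structure as an assumption.

<code>
# -*- coding: utf-8 -*-
from lxml import etree

# Invented markup shaped like what the XPath above targets.
html = """
<div id="zh-recommend">
  <div>ignored</div>
  <div>
    <div>
      <div><h2><a href="/question/1">First recommended question</a></h2></div>
      <div><h2><a href="/question/2">Second recommended question</a></h2></div>
    </div>
  </div>
</div>
"""

page = etree.HTML(html)
for i in range(1, 3):
    xpath = "//div[@id='zh-recommend']/div[2]/div[1]/div[" + str(i) + "]/h2/a"
    for a in page.xpath(xpath):
        print(a.text + ' -> https://www.zhihu.com' + a.attrib['href'])
</code>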

Written just for fun.

By 戴老花镜的莫林

The code:

1. Start from the site's home page and crawl recursively; list1 holds every link encountered so far, list2 holds the links that still need to be requested.

At the start, list1 and list2 contain only the home-page URL (size = 1).

The recursion ends when list2 is empty.

2. When a link is requested, check the returned content type; if it is an image, save it to a folder.

 

 

[python]
# coding:utf-8
# Python 2.x: recursively crawl a site, saving any image responses it gets back.
import re, urllib2, os, datetime, urlparse

def main(LEFT_PAGES):
    if len(LEFT_PAGES) == 0:
        print "No pages left to visit, END"
        return
    else:
        global MAIN_CNT
        print "...entering main(), recursion #%s..." % MAIN_CNT
        tmp_pages = []
        for page in LEFT_PAGES:
            tmp_pages.append(page)

        for each in tmp_pages:
            print "Fetching page: %s" % each
            try:
                resp = urllib2.urlopen(each)
            except urllib2.HTTPError as err:
                print err.code, each
                continue
            finally:
                LEFT_PAGES.remove(each)

            source = resp.read()
            current_url = resp.geturl()
            content_type = resp.headers.get("Content-Type")
            resp.close()

            # save image responses into the source/ folder
            if content_type is not None:
                type1, type2 = content_type.split(";")[0].split("/")
                if type1 is not None and type2 is not None and type1.lower() == "image":
                    src_dir = os.path.dirname(__file__)
                    filename = os.path.join(src_dir, "source", datetime.datetime.now().strftime("%Y%m%d.%H%M%S%f") + "." + type2)
                    fp = open(filename, "wb")
                    fp.write(source)
                    fp.close()

            # extract links and queue unseen same-site ones
            hrefs = re.findall(PATTERN, source)
            if len(hrefs) > 0:
                for each_href in hrefs:
                    href = each_href[1]
                    href = urlparse.urljoin(current_url, href)
                    href = href.replace("/../", "/")
                    if href not in HAS_MEET_PAGES:
                        HAS_MEET_PAGES.append(href)
                        if urlparse.urlparse(href).hostname is not None and "renrendai.com" in urlparse.urlparse(href).hostname:
                            LEFT_PAGES.append(href)

    MAIN_CNT += 1
    main(LEFT_PAGES)

if __name__ == '__main__':
    VISIT_SITE = ""
    HAS_MEET_PAGES = [VISIT_SITE]
    LEFT_PAGES = [VISIT_SITE]
    MAIN_CNT = 1
    PATTERN = re.compile(r'(href|src|area)="([^\s;]+)"')
    main(LEFT_PAGES)
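
Two details in the crawler above are easy to miss: how the Content-Type header is split to decide whether a response is an image, and how relative links captured by the href/src regex are normalised against the current page with urljoin. Here is a small stand-alone check of both, using made-up inputs (Python 2, like the rest of the post).

<code>
# -*- coding: utf-8 -*-
import re
import urlparse  # on Python 3 this would be urllib.parse

PATTERN = re.compile(r'(href|src|area)="([^\s;]+)"')

# Content-Type handling: "image/jpeg; charset=..." -> ("image", "jpeg")
for ct in ["image/jpeg; charset=UTF-8", "text/html; charset=gb18030"]:
    type1, type2 = ct.split(";")[0].split("/")
    print(ct + " -> is image: " + str(type1.lower() == "image"))

# Link extraction and normalisation on an invented snippet.
snippet = '<a href="../list/p2.html">next</a> <img src="/img/logo.png"/>'
current_url = "http://example.com/news/index.html"
for tag, link in re.findall(PATTERN, snippet):
    print(urlparse.urljoin(current_url, link))
# -> http://example.com/list/p2.html
# -> http://example.com/img/logo.png
</code>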


Run results:

 

[Images 1–4: run-result screenshots]

