lrxianed 发布的文章

几年前,那时还没有今日头条、公众号、自媒体的时候,我都是通过“网易云阅读”订阅感兴趣的栏目来获取资讯的,还有鲜果阅读,满满的回忆。那时候获取资讯需要自己选择,需要自己去找寻,如今大数据时代,AI智能时代,资讯app都学会自己推送信息给你了,而且永远也看不完,一直刷就一直推送给你。有限的资讯,看完就结束了,我还可以用剩余的时间做其他有意义的事。无限的资讯,无限地刷屏,没有尽头,时间就好像被绑架了。

今天突发奇想,想重新使用这些阅读器,我登录了久违的网易云阅读,查看我的订阅,但我却发现,很多,绝大多数的订阅栏目已经停更了。再翻翻精品订阅栏目,也全都一片沉寂,几乎都停更了。阅读器没有了资源,那就废了。我才突然醒过来,大家已经不再维护这些订阅栏目了,全都跑到微信公众号、今日头条、抖音等自媒体平台去了。

天啊,毫不夸张地说,微信公众号就是中国RSS啊!

所以,当下把微信公众号用起来才是王道。好吧,我选择接受。

#!/usr/bin/python
import requests
import urllib
import xlwt
import datetime
from lxml import etree
from urllib import request

def htmls(url):
    """Fetch *url* with urllib and return the parsed lxml HTML tree.

    Returns None when the request fails with a URLError, so callers can
    detect dead links (see spider()).
    """
    url = url.replace(" ", "")  # scraped hrefs sometimes contain stray spaces
    head = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'
    # BUG FIX: the original built `head` but never attached it to the request,
    # so every fetch went out with urllib's default User-Agent.
    req = request.Request(url, headers={'User-Agent': head})
    try:
        response2 = request.urlopen(req)
        html = response2.read()
        selector = etree.HTML(html)
        return selector
    except urllib.error.URLError:
        return None

# Fetch the people.com.cn front page ("要闻" headlines) for link extraction.
def firsthtml(url):
    """Fetch *url* via requests (with a desktop UA header) and return the parsed lxml tree."""
    ua = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'
    resp = requests.get(url, headers={'User-Agent': ua})
    return etree.HTML(resp.text)

# NOTE: the XPath expressions below must be adjusted to match the actual page markup.
def urls(url):
    """Return the href of every headline link found on the front page at *url*."""
    tree = firsthtml(url)
    # Headlines appear both as <strong><a> (featured) and as plain <a> items.
    anchors = (tree.xpath('/html/body/section[3]/div[2]/ul/li/strong/a')
               + tree.xpath('/html/body/section[3]/div[2]/ul/li/a'))
    return [a.attrib['href'] for a in anchors]


def spider(url):
    """Scrape one article page and append a dict with keys
    ``time``/``source``/``title``/``content``/``url`` to the module-level
    ``all`` list.

    Reads the module-level ``num`` counter for progress logging only.
    Prints a notice and returns None when the page cannot be fetched or
    neither known page layout matches.
    """
    print('正在处理衔接' + str(num) + ':', url)
    selector = htmls(url)
    if selector is None:
        print('该链接未找到 -_-')
        return
    temp = {}
    try:
        # Primary people.com.cn article layout.
        title_path = selector.xpath('/html/body/div[4]/h1')
        content_path = selector.xpath('//*[@id="rwb_zw"]/p')
        time_path = selector.xpath('/html/body/div[4]/div/div[1]')
        source_path = selector.xpath('/html/body/div[4]/div/div[1]/a')
        temp['time'] = time_path[0].text[0:16]
        temp['source'] = source_path[0].text
        temp['title'] = title_path[0].text
    except (IndexError, AttributeError, TypeError):
        # Fallback layout (older template).  The original used a bare
        # `except:`; narrowed to the failures an unmatched XPath produces.
        title_path = selector.xpath('/html/body/div[@class="clearfix w1000_320 text_title"]/h1')
        content_path = selector.xpath('/html/body/div[@class="fl text_con_left"]/p')
        source_path = selector.xpath('/html/body/div[3]/div/div[1]')
        time_path = selector.xpath('/html/body/div[4]/div/div[1]/a')
        try:
            # BUG FIX: the original sliced the lxml element itself
            # (time_path[0][0:16]), which yields its child *elements*,
            # not the first 16 characters of the date string.
            # NOTE(review): time/source XPaths here look swapped relative
            # to the primary branch — confirm against the fallback layout.
            temp['time'] = time_path[0].text[0:16]
            temp['source'] = source_path[0].text
            temp['title'] = title_path[0].text
        except (IndexError, AttributeError, TypeError):
            print('该链接爬取失败 -_-')
            return
    # Paragraphs with nested markup have .text None; treat those as empty
    # instead of crashing on str + None as the original did.
    temp['content'] = ''.join(p.text or '' for p in content_path)
    temp['url'] = url
    all.append(temp)
    print("成功爬取该链接 ^.^")

# Write the scraped articles to an Excel workbook, one row per article.
def toexcel(datas):
    """Save *datas* (list of article dicts from spider()) to e:\\要闻 <date>.xls."""
    today = datetime.datetime.now().strftime('%Y-%m-%d')
    path = r"e:\要闻 " + today + '.xls'
    workbook = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = workbook.add_sheet('要闻', cell_overwrite_ok=True)
    # Column captions (row 0) and the dict fields written beneath them.
    captions = ('新闻链接', '新闻标题', '刊发时间', '新闻来源', '新闻内容')
    fields = ('url', 'title', 'time', 'source', 'content')
    for col, caption in enumerate(captions):
        sheet.write(0, col, caption)
    for row, data in enumerate(datas, start=1):
        for col, field in enumerate(fields):
            sheet.write(row, col, data[field])
    workbook.save(path)

if __name__ == '__main__':
    num = 1    # progress counter read by spider() for its log lines
    all = []   # accumulator filled by spider(); NOTE: shadows the builtin all()
    for x in urls('http://www.people.com.cn/'):
        spider(x)
        num = num + 1
    toexcel(all)

此前浏览了一条知乎问题“为什么真有人得月子病,却有人说坐月子是陋习?”,
给了我一些与父辈处理问题的启发。在“要不要坐月子”这个问题上,年轻一代和父母之间存在着“严重”分歧,同样需要事先做好沟通以及明确各自的底线,尽量避免冲突,维系各方的感情。说是“坐月子”,岂是简单的做不做的问题?说是“装修”,又岂是“装修”的问题?