lrxianed 发布的文章

#!/usr/bin/python
import requests
import urllib
import xlwt
import datetime
from lxml import etree
from urllib import request

def htmls(url):
    """Download *url* and return it parsed as an lxml HTML tree.

    Spaces are removed from the URL before the request is made, and a
    browser-like User-Agent header is sent with it.

    Returns the object produced by ``etree.HTML`` on success, or ``None``
    when the URL is malformed (for example a relative link without a
    scheme) or the request fails with ``urllib.error.URLError``.
    """
    url = url.replace(" ", "")
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'
    try:
        # Request() raises ValueError for a URL without a scheme, so it is
        # built inside the try block and reported as "not found" as well.
        req = request.Request(url, headers={'User-Agent': user_agent})
        # The context manager closes the connection once the body is read.
        with request.urlopen(req) as response:
            html = response.read()
    except (urllib.error.URLError, ValueError):
        return None
    return etree.HTML(html)

# Get the People's Daily (people.com.cn) headline article links
def firsthtml(url):
    """Fetch *url* with ``requests`` and return it parsed by ``etree.HTML``.

    A desktop-browser User-Agent header is sent with the request, and the
    parser is fed the response's decoded ``text``.
    """
    browser_headers = {}
    browser_headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'
    response = requests.get(url, headers=browser_headers)
    return etree.HTML(response.text)

# NOTE: adjust the XPath expressions below to match the actual markup of the
# page being scraped.
def urls(url):
    """Return the ``href`` of every headline anchor on the page at *url*.

    Anchors wrapped in ``<strong>`` are listed first, followed by the anchors
    that sit directly inside the list items.
    """
    page = firsthtml(url)
    strong_links = page.xpath('/html/body/section[3]/div[2]/ul/li/strong/a')
    plain_links = page.xpath('/html/body/section[3]/div[2]/ul/li/a')
    return [anchor.attrib['href'] for anchor in strong_links + plain_links]


def spider(url):
    """Scrape one article page and append its fields to the global ``all`` list.

    The page is fetched with ``htmls``. Title, publication time and source are
    read with a primary set of XPath expressions; if any of those lookups
    fails, a second set is tried. On success a dict with the keys ``time``,
    ``source``, ``title``, ``content`` and ``url`` is appended to the
    module-level list ``all``.

    The function reads the module-level names ``num`` (progress counter used
    in the log line) and ``all`` (result list); both must be bound before it
    is called.

    Returns ``None`` in every case; failures are reported with ``print``.
    """
    print ('正在处理衔接'+str(num)+':', url)
    selector = htmls(url)
    if selector is None:
        print ('该链接未找到 -_-')
        return
    temp = {}
    # IndexError: the XPath matched nothing. TypeError/AttributeError: the
    # matched node has no usable text (``.text`` is None).
    lookup_errors = (IndexError, TypeError, AttributeError)
    try:
        title_path = selector.xpath('/html/body/div[4]/h1')
        content_path = selector.xpath('//*[@id="rwb_zw"]/p')
        time_path = selector.xpath('/html/body/div[4]/div/div[1]')
        source_path = selector.xpath('/html/body/div[4]/div/div[1]/a')
        temp['time'] = time_path[0].text[0:16]
        temp['source'] = source_path[0].text
        temp['title'] = title_path[0].text
    except lookup_errors:
        # Second attempt with an alternative set of XPath expressions.
        title_path = selector.xpath('/html/body/div[@class="clearfix w1000_320 text_title"]/h1')
        content_path = selector.xpath('/html/body/div[@class="fl text_con_left"]/p')
        # NOTE(review): compared with the first attempt, the time and source
        # expressions look swapped here (time points at the <a>) - confirm
        # against a real page before changing them.
        source_path = selector.xpath('/html/body/div[3]/div/div[1]')
        time_path = selector.xpath('/html/body/div[4]/div/div[1]/a')
        try:
            # Slice the element's text; slicing the element itself would
            # yield a list of child elements instead of a string.
            temp['time'] = time_path[0].text[0:16]
            temp['source'] = source_path[0].text
            temp['title'] = title_path[0].text
        except lookup_errors:
            print ('该链接爬取失败 -_-')
            return
    # A paragraph whose first child is an element has ``text`` set to None;
    # treat it as empty instead of failing on str + None.
    temp['content'] = ''.join(paragraph.text or '' for paragraph in content_path)
    temp['url'] = url
    all.append(temp)
    print ("成功爬取该链接 ^.^")

# Write the scraped data to an Excel workbook
def toexcel(datas):
    """Save *datas* to an ``.xls`` workbook named after today's date.

    *datas* is an iterable of dicts with the keys ``url``, ``title``,
    ``time``, ``source`` and ``content``. Row 0 of the sheet holds the column
    captions; each dict is written to the next row, one key per column.
    The output path is hard-coded to drive ``e:``.
    """
    today = datetime.datetime.now().strftime('%Y-%m-%d')
    file_address = r"e:\要闻 "+ today + '.xls'
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet('要闻', cell_overwrite_ok=True)

    captions = ('新闻链接', '新闻标题', '刊发时间', '新闻来源', '新闻内容')
    for col, caption in enumerate(captions):
        sheet.write(0, col, caption)

    # Dict key stored in each column, in the same order as the captions.
    keys = ('url', 'title', 'time', 'source', 'content')
    for row, data in enumerate(datas, start=1):
        for col, key in enumerate(keys):
            sheet.write(row, col, data[key])

    book.save(file_address)

if __name__ == '__main__':
    # spider() reads the module-level names `num` (1-based progress counter)
    # and `all` (list collecting the scraped articles), so both are bound here.
    num = 1
    all = []
    for num, x in enumerate(urls('http://www.people.com.cn/'), start=1):
        spider(x)
    toexcel(all)

此前浏览了一条知乎问答《为什么真有人得月子病，却有人说坐月子是陋习？》，
给了我一些与父辈处理问题的启发。在“要不要坐月子”这个问题上，年轻一代和父母之间存在着“严重”分歧，同样需要事先做好沟通以及明确各自的底线，尽量避免冲突，维系各方的感情。说是“坐月子”，岂是简单的做不做的问题？说是“装修”，又岂是“装修”的问题？