您现在的位置是:Python分析网页并抓取内容

Python分析网页并抓取内容

分类: Python 日期:2019-08-09 点击:1338
#!/usr/bin/python
# -*- coding: UTF-8 -*-

import requests
import re
import os

from lxml import etree
import html

# Key for the site/city being scraped; it prefixes the index file name,
# the log file names and the output directory used further down.
CITY = "city"

# Login credentials handed to getSession().
# NOTE(review): these look like placeholders - keep real credentials out of
# source control.
USERNAME = "user"
PASSWORD = "name"

# Log in and keep the resulting session
def getSession(username, password, timeout=30):
    """Log in to the site and return the ``requests.Session`` that was used.

    The login form is POSTed through a fresh session so that whatever cookies
    the server sets are kept on the returned object and reused by later
    requests made with it.

    Args:
        username: Account name, sent as the ``ID`` form field.
        password: Account password, sent as the ``PWD`` form field.
        timeout: Seconds to wait for the login request. Without a timeout
            ``requests`` waits indefinitely on a stalled server.

    Returns:
        The session on which the login request was issued. The login response
        is not inspected, so a session is returned even when the server
        rejected the credentials.

    Raises:
        requests.RequestException: If the login request fails or times out.
    """
    # Login endpoint
    LOGIN_URL = 'http://www.test.com/index.php?ajax=1'
    # Form fields expected by the login endpoint
    DATA = {
        "com": 'com_passport',
        "method": 'dologin',
        "ID": username,
        "PWD": password,
        "checkbox": 'on',
    }
    # Browser-like User-Agent; sent with the login request only, it is not
    # installed on the session itself.
    HEADERS = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
    # The session stores the login cookies for subsequent requests
    ROOM_SESSION = requests.Session()
    ROOM_SESSION.post(LOGIN_URL, data=DATA, headers=HEADERS, timeout=timeout)
    return ROOM_SESSION

# Build the URL of one page of the order list
def getPageUrl(num):
    """Return the order-list URL for page *num*.

    *num* is converted with ``str()`` and appended to the fixed query string.
    """
    url_parts = (
        'http://www.test.com/index.php?method=index&app=order&page=',
        str(num),
    )
    return ''.join(url_parts)

# Save text to a file
def saveFile(text, name):
    """Write *text* to the file *name* as UTF-8, replacing existing content.

    Args:
        text: The string to write.
        name: Destination path. Missing parent directories are created.

    Raises:
        OSError: If the directory or file cannot be created or written.
    """
    # The caller writes into a per-city sub-directory that may not exist yet.
    parent = os.path.dirname(name)
    if parent:
        os.makedirs(parent, exist_ok=True)
    # Explicit encoding: the platform default may be unable to encode the
    # page text. `with` closes the handle even if the write fails.
    with open(name, "w", encoding="utf-8") as fo:
        fo.write(text)

# Append a line to one of the log files
def addLog(type, text):
    """Append *text* as one line to the log file ``<CITY>_<type>.txt``.

    Args:
        type: Log category used in the file name. The parameter name shadows
            the ``type`` builtin inside this function; it is kept so that
            existing callers are unaffected.
        text: The message to append; a newline is written after it.
    """
    ERR_FILE = CITY + '_' + type + '.txt'
    # Explicit encoding because callers log non-ASCII messages; `with`
    # guarantees the handle is closed even if the write fails.
    with open(ERR_FILE, "a", encoding="utf-8") as ferror:
        ferror.write(text)
        ferror.write("\n")

##########################################################

# Locally saved index page for the city; its links are the pages to fetch
FILE_NAME = CITY + '_index.html'
# Read the whole index page (the handle is closed by the `with` block)
with open(FILE_NAME, 'r', encoding='utf-8') as fo:
    text = fo.read()

# Log in once; the same session is reused for every page request
SESSION = getSession(USERNAME, PASSWORD)

# Output directory for the downloaded pages
os.makedirs(CITY, exist_ok=True)

# Extract every link target from the index page. The parsed document is
# bound to `tree` so that the imported `html` module is not rebound.
tree = etree.HTML(text)
url_data = tree.xpath('//a/@href')
for ORDER_URL in url_data:
    # Record every URL that is attempted
    addLog('order_log', ORDER_URL)
    print(ORDER_URL)
    try:
        ORDER_RES = SESSION.get(ORDER_URL, timeout=30)
    except requests.RequestException:
        # Unusable hrefs and network errors must not abort the whole run
        addLog('order_error', '获取'+ ORDER_URL +'失败')
        continue
    print(ORDER_RES.status_code)
    # Anything but HTTP 200 counts as a failed fetch
    if ORDER_RES.status_code != 200:
        addLog('order_error', '获取'+ ORDER_URL +'失败')
        continue
    addLog('success', '获取'+ ORDER_URL +'成功')
    ORDER_TEXT = ORDER_RES.text
    # The file name is the URL from "app=order" to the end
    matchObj = re.search(r"app=order(.*)", ORDER_URL)
    if matchObj is None:
        # Link is not an order page: there is nothing to name the file after
        addLog('order_error', '无法从' + ORDER_URL + '解析文件名')
        continue
    ORDER_NAME = matchObj.group()
    ORDER_FILE = CITY + '/' + ORDER_NAME + '.html'
    # Save the page
    saveFile(ORDER_TEXT, ORDER_FILE)
    print(ORDER_FILE)
标签: Python

站长简介

    姓名:喻理
    微信:yuli0927
    邮箱:yuli0927@126.com
    不懂业务的运维工程师不是一个好程序员。

分类

最新文章

热门文章

全站标签