大神论坛

找回密码
快速注册
查看: 322 | 回复: 1

[原创] 报纸下载器 目前支持人民日报,江西日报,赣南日报 可扩展

主题

帖子

0

积分

初入江湖

UID
668
积分
0
精华
威望
0 点
违规
大神币
68 枚
注册时间
2023-10-14 10:49
发表于 2023-12-17 15:34
本帖最后由 弑神者91511 于 2023-12-17 15:34 编辑

勾选报纸种类、选择日期,然后开始下载即可
目前仅添加了三种报纸,理论上还可以继续扩展

一、主界面
主界面是用 Qt Designer 画的,再用 pyuic5 转成 py 代码


# -*- coding: utf-8 -*-

# Form implementation generated from reading ui file 'UI.ui'
#
# Created by: PyQt5 UI code generator 5.15.9
#
# WARNING: Any manual changes made to this file will be lost when pyuic5 is
# run again. Do not edit this file unless you know what you are doing.


from PyQt5 import QtCore, QtGui, QtWidgets


class Ui_MainWindow(object):
    """Widget tree of the newspaper downloader window (pyuic5-generated).

    ``setupUi`` creates the widgets and layouts on a main window and
    ``retranslateUi`` assigns every user-visible caption.
    """

    def setupUi(self, MainWindow):
        """Create all child widgets and layouts on ``MainWindow``."""
        # Initial window size is derived from the screen: a quarter of its
        # width and 1/2.5 of its height.
        screen = QtWidgets.QDesktopWidget().screenGeometry()  # adapt to the screen width/height
        width = int(screen.width() / 4)
        height = int(screen.height() / 2.5)

        MainWindow.setObjectName("MainWindow")
        MainWindow.setEnabled(True)
        MainWindow.resize(width, height)
        MainWindow.setMinimumSize(QtCore.QSize(415, 515))
        MainWindow.setAcceptDrops(True)
        # resource_path is the module-level helper defined below this class;
        # it is only looked up when setupUi runs.
        icon = QtGui.QIcon()
        icon.addPixmap(QtGui.QPixmap(resource_path('paper.ico')), QtGui.QIcon.Normal, QtGui.QIcon.Off)
        MainWindow.setWindowIcon(icon)
        MainWindow.setLayoutDirection(QtCore.Qt.LeftToRight)
        MainWindow.setLocale(QtCore.QLocale(QtCore.QLocale.Chinese, QtCore.QLocale.China))
        self.centralwidget = QtWidgets.QWidget(MainWindow)
        # NOTE(review): this minimum height (550) is larger than the window's
        # own minimum (515) set above — confirm which one is intended.
        self.centralwidget.setMinimumSize(QtCore.QSize(415, 550))
        self.centralwidget.setObjectName("centralwidget")
        # Top-level vertical layout: selector row, button row, text area.
        self.verticalLayout = QtWidgets.QVBoxLayout(self.centralwidget)
        self.verticalLayout.setObjectName("verticalLayout")
        self.horizontalLayout = QtWidgets.QHBoxLayout()
        self.horizontalLayout.setObjectName("horizontalLayout")

        # Left column: heading label plus three pre-checked check boxes,
        # separated by expanding vertical spacers.
        self.verticalLayout_2 = QtWidgets.QVBoxLayout()
        self.verticalLayout_2.setObjectName("verticalLayout_2")
        self.label_2 = QtWidgets.QLabel(self.centralwidget)
        self.label_2.setAlignment(QtCore.Qt.AlignCenter)
        self.label_2.setObjectName("label_2")
        self.verticalLayout_2.addWidget(self.label_2, 0, QtCore.Qt.AlignHCenter | QtCore.Qt.AlignVCenter)
        spacerItem = QtWidgets.QSpacerItem(20, 40, QtWidgets.QSizePolicy.Minimum, QtWidgets.QSizePolicy.Expanding)
        self.verticalLayout_2.addItem(spacerItem)
        self.checkBox = QtWidgets.QCheckBox(self.centralwidget)
        self.checkBox.setChecked(True)
        self.checkBox.setTristate(False)
        self.checkBox.setObjectName("checkBox")
        self.verticalLayout_2.addWidget(self.checkBox)
        spacerItem1 = QtWidgets.QSpacerItem(20, 40, QtWidgets.QSizePolicy.Minimum, QtWidgets.QSizePolicy.Expanding)
        self.verticalLayout_2.addItem(spacerItem1)
        self.checkBox_3 = QtWidgets.QCheckBox(self.centralwidget)
        self.checkBox_3.setChecked(True)
        self.checkBox_3.setObjectName("checkBox_3")
        self.verticalLayout_2.addWidget(self.checkBox_3)
        spacerItem2 = QtWidgets.QSpacerItem(20, 40, QtWidgets.QSizePolicy.Minimum, QtWidgets.QSizePolicy.Expanding)
        self.verticalLayout_2.addItem(spacerItem2)
        self.checkBox_2 = QtWidgets.QCheckBox(self.centralwidget)
        self.checkBox_2.setChecked(True)
        self.checkBox_2.setObjectName("checkBox_2")
        self.verticalLayout_2.addWidget(self.checkBox_2)
        spacerItem3 = QtWidgets.QSpacerItem(20, 40, QtWidgets.QSizePolicy.Minimum, QtWidgets.QSizePolicy.Expanding)
        self.verticalLayout_2.addItem(spacerItem3)
        self.horizontalLayout.addLayout(self.verticalLayout_2)

        # Right column: heading label above a single-selection calendar.
        self.verticalLayout_4 = QtWidgets.QVBoxLayout()
        self.verticalLayout_4.setObjectName("verticalLayout_4")
        self.label = QtWidgets.QLabel(self.centralwidget)
        self.label.setObjectName("label")
        self.verticalLayout_4.addWidget(self.label, 0, QtCore.Qt.AlignHCenter | QtCore.Qt.AlignVCenter)
        self.calendarWidget = QtWidgets.QCalendarWidget(self.centralwidget)
        self.calendarWidget.setMinimumSize(QtCore.QSize(314, 244))
        self.calendarWidget.setGridVisible(True)
        self.calendarWidget.setSelectionMode(QtWidgets.QCalendarWidget.SingleSelection)
        self.calendarWidget.setHorizontalHeaderFormat(QtWidgets.QCalendarWidget.ShortDayNames)
        self.calendarWidget.setVerticalHeaderFormat(QtWidgets.QCalendarWidget.NoVerticalHeader)
        self.calendarWidget.setDateEditEnabled(False)
        self.calendarWidget.setObjectName("calendarWidget")
        self.verticalLayout_4.addWidget(self.calendarWidget)
        self.horizontalLayout.addLayout(self.verticalLayout_4)
        self.verticalLayout.addLayout(self.horizontalLayout)

        # Button row: two push buttons pushed apart by an expanding spacer.
        self.horizontalLayout_2 = QtWidgets.QHBoxLayout()
        self.horizontalLayout_2.setObjectName("horizontalLayout_2")
        self.pushButton = QtWidgets.QPushButton(self.centralwidget)
        self.pushButton.setObjectName("pushButton")
        self.horizontalLayout_2.addWidget(self.pushButton)
        spacerItem4 = QtWidgets.QSpacerItem(40, 20, QtWidgets.QSizePolicy.Expanding, QtWidgets.QSizePolicy.Minimum)
        self.horizontalLayout_2.addItem(spacerItem4)
        self.pushButton_2 = QtWidgets.QPushButton(self.centralwidget)
        self.pushButton_2.setObjectName("pushButton_2")
        self.horizontalLayout_2.addWidget(self.pushButton_2)
        self.verticalLayout.addLayout(self.horizontalLayout_2)

        # Bottom row: the text area.
        self.horizontalLayout_5 = QtWidgets.QHBoxLayout()
        self.horizontalLayout_5.setObjectName("horizontalLayout_5")
        self.textEdit = QtWidgets.QTextEdit(self.centralwidget)
        self.textEdit.setEnabled(True)
        self.textEdit.setMinimumSize(QtCore.QSize(395, 230))
        self.textEdit.setObjectName("textEdit")
        self.horizontalLayout_5.addWidget(self.textEdit)
        self.verticalLayout.addLayout(self.horizontalLayout_5)
        MainWindow.setCentralWidget(self.centralwidget)

        self.retranslateUi(MainWindow)
        QtCore.QMetaObject.connectSlotsByName(MainWindow)

    def retranslateUi(self, MainWindow):
        """Set the window title and the captions of all labelled widgets."""
        _translate = QtCore.QCoreApplication.translate
        MainWindow.setWindowTitle(_translate("MainWindow", "报纸下载器"))
        self.label_2.setText(_translate("MainWindow", "报纸种类"))
        # The check-box captions double as the newspaper names.
        self.checkBox.setText(_translate("MainWindow", "人民日报"))
        self.checkBox_3.setText(_translate("MainWindow", "江西日报"))
        self.checkBox_2.setText(_translate("MainWindow", "赣南日报"))
        self.label.setText(_translate("MainWindow", "选择日期"))
        self.pushButton.setText(_translate("MainWindow", "开始下载"))
        self.pushButton_2.setText(_translate("MainWindow", "查看下载"))


def resource_path(relative):
    """Resolve ``relative`` against ``sys._MEIPASS`` when that attribute exists.

    When ``sys`` has no ``_MEIPASS`` attribute the path is returned as given.
    """
    import os
    import sys

    if not hasattr(sys, "_MEIPASS"):
        return os.path.join(relative)
    return os.path.join(sys._MEIPASS, relative)

二、逻辑部分
1.RMRB下载

from get_page import get_page, make_dir, header, merge_pdf, handle_string
from lxml import etree
import os
import requests
import time


def download(url, date, year, month, day, date_path, paper_name):
    """Download every page PDF of one issue, then merge them.

    Args:
        url: Address of the issue's first page, which lists all pages.
        date: Issue date as ``YYYY-MM-DD`` (used in messages and the merge).
        year, month, day: The components of ``date`` as strings.
        date_path: Directory that receives the per-page PDFs.
        paper_name: Display name of the newspaper.

    Pages whose PDF already exists in ``date_path`` are not fetched again.
    A page without a usable PDF link is reported and skipped instead of
    aborting the whole issue.
    """
    response = get_page(url)
    if not response:
        # get_page yields no text when the request did not succeed.
        print(f'{paper_name}({date}) 版面目录获取失败')
        return

    seclector = etree.HTML(response)
    # NOTE(review): the predicate "[@]" is not valid XPath; the attribute
    # test appears to have been lost when this code was pasted. Restore the
    # original selectors before running.
    chapter_urls = seclector.xpath('//div[@]/a/@href')
    chapter_names = seclector.xpath('//div[@]/a/text()')

    print(f'{paper_name}({date}) 共有 {len(chapter_urls)} 个版面')

    # The page number is the 1-based position in the list. enumerate avoids
    # the repeated list.index() scan, which also returned the wrong position
    # whenever two entries shared the same href.
    for page_no, (chapter_name, _) in enumerate(zip(chapter_names, chapter_urls), start=1):
        chapter_name = handle_string(chapter_name)
        pdf_path = f'{date_path}/{chapter_name}.pdf'
        if os.path.exists(pdf_path):
            continue

        chapter_url = f'http://paper.people.com.cn/rmrb/html/{year}-{month}/{day}/nbs.D110000renmrb_{page_no:02d}.htm'
        chapter_response = get_page(chapter_url)
        pdf_links = (
            etree.HTML(chapter_response).xpath('//p[@]/a/@href')
            if chapter_response else []
        )
        if pdf_links:
            # Keep the last five path segments of the link and re-root them,
            # e.g. http://paper.people.com.cn/rmrb/images/2023-11/13/01/rmrb2023111301.pdf
            tail = pdf_links[0].split('/')[-5:]
            chapter_pdf_dowload_url = 'http://paper.people.com.cn/rmrb/' + '/'.join(tail)

            # A timeout keeps one stalled transfer from hanging the download.
            pdf_response = requests.get(chapter_pdf_dowload_url, headers=header, timeout=30)
            # .get() tolerates a missing header; startswith() tolerates
            # parameters such as "; charset=...".
            content_type = pdf_response.headers.get('Content-Type', '').lower()
            if content_type.startswith('application/pdf'):
                with open(pdf_path, 'wb') as f:
                    f.write(pdf_response.content)
                print(f'{paper_name}({date}) {chapter_name} 保存完毕!!!')
        else:
            print(f'{paper_name}({date}) {chapter_name} 未找到PDF链接,已跳过')

        # Pause between pages to avoid hammering the server.
        time.sleep(1)
    merge_pdf(date, date_path, paper_name)


def main(paper_name, date):
    """Create the ``<paper_name>/<date>`` folder and download that issue.

    ``date`` is split on ``-`` into year, month and day.
    """
    parts = date.split('-')
    year, month, day = parts[0], parts[1], parts[2]

    date_path = f'{paper_name}/{date}'
    make_dir(date_path)

    # First page of the issue; download() reads the page list from it.
    first_page = f'http://paper.people.com.cn/rmrb/html/{year}-{month}/{day}/nbs.D110000renmrb_01.htm'
    download(first_page, date, year, month, day, date_path, paper_name)


# Script entry point: downloads one fixed sample issue when run directly.
if __name__ == '__main__':
    paper_name = '人民日报'
    date = '2023-12-12'
    main(paper_name, date)

2.JXRB下载

import requests
import os
import time
from lxml import etree

from get_page import get_page, make_dir, header, merge_pdf,handle_string


def download(url, date, year, month, day, date_path, paper_name):
    """Download every page PDF of one issue, then merge them.

    Args:
        url: Address of the issue's index page, which lists all pages.
        date: Issue date as ``YYYY-MM-DD`` (used in messages and the merge).
        year, month, day: The components of ``date`` as strings.
        date_path: Directory that receives the per-page PDFs.
        paper_name: Display name of the newspaper.

    Pages whose PDF already exists in ``date_path`` are not fetched again.
    A page without a usable PDF link is reported and skipped instead of
    aborting the whole issue.
    """
    response = get_page(url)
    if not response:
        # get_page yields no text when the request did not succeed.
        print(f'{paper_name}({date}) 版面目录获取失败')
        return

    seclector = etree.HTML(response)
    # NOTE(review): the predicate "[@]" is not valid XPath; the attribute
    # test appears to have been lost when this code was pasted. Restore the
    # original selectors before running.
    chapter_urls = seclector.xpath('//td[@]/a/@href')
    chapter_names = seclector.xpath('//td[@]/a/text()')

    print(f'{paper_name}({date}) 共有 {len(chapter_urls)} 个版面')

    for chapter_name, chapter_url in zip(chapter_names, chapter_urls):
        chapter_name = handle_string(chapter_name)
        pdf_path = f'{date_path}/{chapter_name}.pdf'
        if os.path.exists(pdf_path):
            continue

        # hrefs on the index page are relative to the issue's directory.
        chapter_url = f'http://epaper.jxxw.com.cn/html/{year}-{month}/{day}/{chapter_url}'
        chapter_response = get_page(chapter_url)
        pdf_links = (
            etree.HTML(chapter_response).xpath('//a[@id="bigbmshowpdf"]/@href')
            if chapter_response else []
        )
        if pdf_links:
            # Keep the last three path segments of the link and re-root them
            # under the site's resfile directory.
            tail = pdf_links[0].split('/')[-3:]
            chapter_pdf_dowload_url = 'http://epaper.jxxw.com.cn/resfile/' + '/'.join(tail)

            # A timeout keeps one stalled transfer from hanging the download.
            pdf_response = requests.get(chapter_pdf_dowload_url, headers=header, timeout=30)
            # .get() tolerates a missing header; startswith() tolerates
            # parameters such as "; charset=...".
            content_type = pdf_response.headers.get('Content-Type', '').lower()
            if content_type.startswith('application/pdf'):
                with open(pdf_path, 'wb') as f:
                    f.write(pdf_response.content)
                print(f'{paper_name}({date}) {chapter_name} 保存完毕!!!')
        else:
            print(f'{paper_name}({date}) {chapter_name} 未找到PDF链接,已跳过')

        # Pause between pages to avoid hammering the server.
        time.sleep(1)
    merge_pdf(date, date_path, paper_name)


def main(paper_name, date):
    """Create the ``<paper_name>/<date>`` folder and download that issue.

    ``date`` is split on ``-`` into year, month and day.
    """
    parts = date.split('-')
    year, month, day = parts[0], parts[1], parts[2]

    date_path = f'{paper_name}/{date}'
    make_dir(date_path)

    # Index page of the issue; download() reads the page list from it.
    index_page = f'http://epaper.jxxw.com.cn/html/{year}-{month}/{day}/index_{date}.htm'
    download(index_page, date, year, month, day, date_path, paper_name)


# Script entry point: downloads one fixed sample issue when run directly.
if __name__ == '__main__':
    paper_name = '江西日报'
    date = '2023-12-12'
    main(paper_name, date)

3.GNRB下载

import requests
import os
import time
from lxml import etree

from get_page import get_page, make_dir, header, merge_pdf,handle_string


def download(url, date, year, month, day, date_path, paper_name):
    """Download every page PDF of one issue, then merge them.

    Args:
        url: Address of the issue's index page, which lists all pages.
        date: Issue date as ``YYYY-MM-DD`` (used in messages and the merge).
        year, month, day: The components of ``date`` as strings.
        date_path: Directory that receives the per-page PDFs.
        paper_name: Display name of the newspaper.

    Pages whose PDF already exists in ``date_path`` are reported and not
    fetched again. A page without a usable PDF link is reported and skipped
    instead of aborting the whole issue.
    """
    response = get_page(url)
    if not response:
        # get_page yields no text when the request did not succeed.
        print(f'{paper_name}({date}) 版面目录获取失败')
        return

    seclector = etree.HTML(response)
    # NOTE(review): the predicate "[@]" is not valid XPath; the attribute
    # test appears to have been lost when this code was pasted. Restore the
    # original selectors before running.
    chapter_urls = seclector.xpath('//td[@]/a/@href')
    chapter_names = seclector.xpath('//td[@]/a/text()')

    print(f'{paper_name}({date}) 共有 {len(chapter_urls)} 个版面')

    for chapter_name, chapter_url in zip(chapter_names, chapter_urls):
        chapter_name = handle_string(chapter_name)
        pdf_path = f'{date_path}/{chapter_name}.pdf'
        if os.path.exists(pdf_path):
            print(f'{paper_name}({date}) {chapter_name} 保存完毕~~~')
            continue

        # hrefs on the index page are relative to the issue's directory,
        # e.g. https://szb.gnrbs.cn/html/2023-12/16/node_95762.htm
        chapter_url = f'https://szb.gnrbs.cn/html/{year}-{month}/{day}/{chapter_url}'
        chapter_response = get_page(chapter_url)
        pdf_links = (
            etree.HTML(chapter_response).xpath('//a[@id="bigbmshowpdf"]/@href')
            if chapter_response else []
        )
        if pdf_links:
            # Keep the last three path segments of the link and re-root them,
            # e.g. https://szb.gnrbs.cn/resfile/2023-12-16/01/gnrb-20231216-001.pdf
            tail = pdf_links[0].split('/')[-3:]
            chapter_pdf_dowload_url = 'https://szb.gnrbs.cn/resfile/' + '/'.join(tail)

            # A timeout keeps one stalled transfer from hanging the download.
            pdf_response = requests.get(chapter_pdf_dowload_url, headers=header, timeout=30)
            # .get() tolerates a missing header; startswith() tolerates
            # parameters such as "; charset=...".
            content_type = pdf_response.headers.get('Content-Type', '').lower()
            if content_type.startswith('application/pdf'):
                with open(pdf_path, 'wb') as f:
                    f.write(pdf_response.content)
                print(f'{paper_name}({date}) {chapter_name} 保存完毕!!!')
        else:
            print(f'{paper_name}({date}) {chapter_name} 未找到PDF链接,已跳过')

        # Pause between pages to avoid hammering the server.
        time.sleep(1)

    print(f'{paper_name}({date}) 版面下载完毕,开始合并,请稍候……')
    merge_pdf(date, date_path, paper_name)


def main(paper_name, date):
    """Create the ``<paper_name>/<date>`` folder and download that issue.

    ``date`` is split on ``-`` into year, month and day.
    """
    parts = date.split('-')
    year, month, day = parts[0], parts[1], parts[2]

    date_path = f'{paper_name}/{date}'
    make_dir(date_path)

    # Index page of the issue; download() reads the page list from it.
    index_page = f'https://szb.gnrbs.cn/html/{year}-{month}/{day}/index_{year}-{month}-{day}.htm'
    download(index_page, date, year, month, day, date_path, paper_name)


# Script entry point: downloads one fixed sample issue when run directly.
# main() requires both a paper name and a date; calling it with no arguments
# raised TypeError.
if __name__ == '__main__':
    paper_name = '赣南日报'
    date = '2023-12-16'
    main(paper_name, date)

4.调用逻辑

from PyQt5 import QtWidgets, QtCore, QtGui
from UI import Ui_MainWindow

from PyQt5.QtWidgets import *
from PyQt5.QtCore import *
from PyQt5.QtGui import *

import rmrb_downloader
import jxrb_downloader
import gnrb_downloader

import os


# Emits a signal for each piece of written text
class Stream(QObject):
    """File-like object that forwards written text through a Qt signal.

    Intended as a replacement for ``sys.stdout`` / ``sys.stderr`` so that
    printed output can be shown in a widget.
    """

    newText = pyqtSignal(str)

    def write(self, text):
        """Emit ``text`` (converted to ``str``) on ``newText``."""
        self.newText.emit(str(text))
        # NOTE(review): presumably here to keep the UI repainting while
        # output is produced; write() is also reached from the worker thread
        # through print() — confirm this call is wanted there.
        QApplication.processEvents()

    def flush(self):
        """Do nothing; text is emitted immediately by ``write``.

        Present because callers of a stdout/stderr replacement (for example
        ``print(..., flush=True)``) expect the stream to provide ``flush``.
        """


class Thread(QThread):
    """Worker thread that downloads each selected newspaper for one date."""

    def __init__(self, paper_selector_list=None, select_date=None, parent=None):
        super().__init__(parent)

        self.paper_selector_list = paper_selector_list
        self.select_date = select_date

    def __del__(self):
        # Block until run() has finished before the object goes away.
        self.wait()

    def run(self):
        """Run the matching downloader module for every selected paper.

        Any exception raised while downloading is printed, which ends the
        remaining work of this run.
        """
        divider = '--------分--------隔--------线--------'
        # Newspaper name -> module whose main(paper_name, date) fetches it.
        downloaders = {
            '人民日报': rmrb_downloader,
            '江西日报': jxrb_downloader,
            '赣南日报': gnrb_downloader,
        }

        print(divider)
        print(f'选择的报纸种类有:{self.paper_selector_list}')
        print(f'选择的下载日期是:{self.select_date}')
        try:
            for name in self.paper_selector_list:
                self.paper_name = name
                module = downloaders.get(self.paper_name)
                if module is None:
                    # Names without a downloader are silently ignored.
                    continue
                print(divider)
                print(f'开始下载 {self.select_date} {self.paper_name}')
                module.main(self.paper_name, self.select_date)
            print(divider)

        except Exception as e:
            print(e)


class MainWindow(QtWidgets.QMainWindow, Ui_MainWindow):
    """Main window: wires the generated UI to the download worker thread."""

    def __init__(self, parent=None):
        # Imported locally because this module only imports ``sys`` inside
        # its ``__main__`` guard; without it, constructing the window from an
        # importing module raised NameError.
        import sys

        super(MainWindow, self).__init__(parent)
        self.setupUi(self)
        self.center()

        # Route everything printed to stdout/stderr into the text widget.
        sys.stdout = Stream(newText=self.onUpdateText)
        sys.stderr = Stream(newText=self.onUpdateText)

        # "Start download" button
        self.pushButton.clicked.connect(self.start_download)
        # "View downloads" button
        self.pushButton_2.clicked.connect(self.show_download)

    def start_download(self):
        """Start a worker thread for the selected papers and date.

        A click while a previous download is still running is ignored, so
        two workers never write into the same issue directory at once.
        """
        # Until the first download assigns it, ``self.thread`` resolves to
        # the inherited QObject.thread() method, hence the isinstance check.
        running = getattr(self, 'thread', None)
        if isinstance(running, Thread) and running.isRunning():
            print('已有下载任务正在进行,请稍候……')
            return

        # Selected date
        self.select_date = self.get_date()

        # Selected newspapers
        self.paper_selector_list = self.get_paper_name()

        self.thread = Thread(self.paper_selector_list, self.select_date)
        self.thread.start()

    def show_download(self):
        """Open the current working directory in the system file browser."""
        try:
            os.startfile(f'{os.getcwd()}')
        except Exception as e:
            print(e)

    def get_paper_name(self):
        """Return the captions of all checked newspaper check boxes."""
        # Order matches the on-screen order; the captions are the newspaper
        # names: checkBox = People's Daily, checkBox_3 = Jiangxi Daily,
        # checkBox_2 = Gannan Daily.
        boxes = (self.checkBox, self.checkBox_3, self.checkBox_2)
        return [box.text() for box in boxes if box.isChecked()]

    def get_date(self):
        """Return the calendar's selected date as ``YYYY-MM-DD``."""
        date = QtCore.QDate(self.calendarWidget.selectedDate())
        year = date.year()
        month = str(date.month()).zfill(2)  # pad to two digits
        day = str(date.day()).zfill(2)  # pad to two digits
        select_date = f'{year}-{month}-{day}'
        return select_date

    def onUpdateText(self, text):
        """Write console output to text widget."""
        cursor = self.textEdit.textCursor()
        cursor.movePosition(QTextCursor.End)
        cursor.insertText(text)
        self.textEdit.setTextCursor(cursor)
        self.textEdit.ensureCursorVisible()

    def center(self):
        """Move the window to the centre of the available screen area."""
        qr = self.frameGeometry()
        cp = QDesktopWidget().availableGeometry().center()
        qr.moveCenter(cp)
        self.move(qr.topLeft())

    def closeEvent(self, event):
        """Ask for confirmation before the window closes."""
        reply = QMessageBox.question(self, '退出提示',
                                     "您确定要退出吗?", QMessageBox.Yes |
                                     QMessageBox.No, QMessageBox.No)

        # Anything other than an explicit Yes keeps the window open.
        if reply == QMessageBox.Yes:
            event.accept()
        else:
            event.ignore()


def resource_path(relative):
    """Resolve ``relative`` against ``sys._MEIPASS`` when that attribute exists.

    When ``sys`` has no ``_MEIPASS`` attribute the path is returned as given.
    """
    import os
    import sys

    if not hasattr(sys, "_MEIPASS"):
        return os.path.join(relative)
    return os.path.join(sys._MEIPASS, relative)


# Script entry point: create the Qt application and show the main window.
if __name__ == '__main__':
    # This import also provides the module-global ``sys`` that
    # MainWindow.__init__ uses when it replaces sys.stdout / sys.stderr.
    import sys

    app = QtWidgets.QApplication(sys.argv)
    mainWindow = MainWindow()
    mainWindow.show()
    # Hand the event loop's exit status back to the interpreter.
    sys.exit(app.exec_())

5.其他公共代码,都放到同一个 py 文件(get_page.py)里了

import requests
import os
from pypdf import PdfMerger, PdfReader
import shutil
import time
import re

# Request headers sent with every HTTP call in this module; the user-agent
# is a desktop Chrome 116 string.
header = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
}


def handle_string(string):
    """Return ``str(string)`` with each CRLF and the spaces after it removed."""
    crlf_and_indent = re.compile(r'\r\n *')
    return crlf_and_indent.sub('', str(string))


def merge_pdf(date, date_path, paper_name):
    """Merge the page PDFs in ``date_path`` into one bookmarked PDF.

    Args:
        date: Issue date, used in the output file name.
        date_path: Directory holding the per-page PDFs; it is deleted
            afterwards.
        paper_name: Newspaper name; the merged file is written to
            ``<paper_name>/<paper_name>(<date>).pdf``.

    Pages are merged in sorted file-name order and each one gets an outline
    entry named after its file. If the directory holds no PDF, nothing is
    written.
    """
    # Only feed real PDFs to pypdf; any other entry would fail to parse.
    pdf_list = sorted(
        name for name in os.listdir(date_path)
        if name.lower().endswith('.pdf')
    )
    pdf_merge = PdfMerger()
    try:
        # Running page total; it is the start page of the next outline entry.
        pdf_page_num = 0
        for pdf in pdf_list:
            pdf_path = os.path.join(date_path, pdf)
            pdf_in = PdfReader(pdf_path, strict=False)
            # splitext drops only the extension, so a title that itself
            # contains a dot is no longer cut at its first dot.
            pdf_title = os.path.splitext(pdf)[0]

            pdf_merge.append(pdf_path)
            pdf_merge.add_outline_item(pdf_title, pdf_page_num, None)
            pdf_page_num += len(pdf_in.pages)

        if pdf_list:
            pdf_merge.write(f'{paper_name}/{paper_name}({date}).pdf')
            print(f'{paper_name}({date}).pdf下载完成')
        else:
            # Avoid writing an empty PDF and reporting it as a success.
            print(f'{paper_name}({date}) 没有可合并的版面')
    finally:
        # Close the merger even when a page fails to parse.
        pdf_merge.close()

    time.sleep(1)
    # Remove the per-page directory
    shutil.rmtree(date_path)


def make_dir(path):
    """Create ``path`` (including parents) unless it already exists."""
    if not os.path.exists(path):
        os.makedirs(path)


def get_page(url, timeout=10):
    """Fetch ``url`` and return its body decoded as UTF-8.

    Args:
        url: Page address to request.
        timeout: Seconds to wait for the server before requests raises a
            timeout error; without one a stalled connection blocks forever.

    Returns:
        The response text when the status code is 200, otherwise ``None``.
    """
    response = requests.get(url, headers=header, timeout=timeout)
    if response.status_code != 200:
        return None
    # Decode as UTF-8 regardless of what the response headers declare.
    response.encoding = 'utf-8'
    return response.text

三、界面展示

四、不足之处
1.三个下载逻辑相似,可以集成到一个函数中调用
2.使用 pypdf 库合并 PDF 时,会输出“Multiple definitions in dictionary at byte 0x13866c for key /Ascent”之类的警告信息,不过不影响使用;
有知道怎样解决的大佬,还望帮忙解答

五、声明
此仅供学习研究使用,请勿用于其他用途

注:若转载请注明大神论坛来源(本贴地址)与作者信息。

主题

帖子

4

积分

初入江湖

UID
795
积分
4
精华
威望
8 点
违规
大神币
58 枚
注册时间
2024-02-03 12:00
发表于 2024-02-17 15:01:52.0
下下来看看报纸

返回顶部