本帖最后由 弑神者91511 于 2023-12-17 15:34 编辑
勾选报纸种类、选择日期,然后点击“开始下载”即可。目前仅添加了三种报纸,理论上还可以继续扩展。
一、主界面 主界面是用 Qt Designer 绘制的,再通过 pyuic5 转成 py 代码。
# -*- coding: utf-8 -*-

# Form implementation generated from reading ui file 'UI.ui'
#
# Created by: PyQt5 UI code generator 5.15.9
#
# WARNING: Any manual changes made to this file will be lost when pyuic5 is
# run again.  Do not edit this file unless you know what you are doing.

import os
import sys

from PyQt5 import QtCore, QtGui, QtWidgets


class Ui_MainWindow(object):
    """Widget tree of the newspaper-downloader main window.

    The window holds three newspaper check boxes, a calendar for picking the
    issue date, two push buttons and a text box.
    """

    def setupUi(self, MainWindow):
        """Create every child widget and attach it to *MainWindow*."""
        # Initial size follows the screen: 1/4 of its width, 1/2.5 of its height.
        screen = QtWidgets.QDesktopWidget().screenGeometry()
        width = int(screen.width() / 4)
        height = int(screen.height() / 2.5)

        # --- top-level window -------------------------------------------------
        MainWindow.setObjectName("MainWindow")
        MainWindow.setEnabled(True)
        MainWindow.resize(width, height)
        MainWindow.setMinimumSize(QtCore.QSize(415, 515))
        MainWindow.setAcceptDrops(True)
        icon = QtGui.QIcon()
        icon.addPixmap(QtGui.QPixmap(resource_path('paper.ico')), QtGui.QIcon.Normal, QtGui.QIcon.Off)
        MainWindow.setWindowIcon(icon)
        MainWindow.setLayoutDirection(QtCore.Qt.LeftToRight)
        MainWindow.setLocale(QtCore.QLocale(QtCore.QLocale.Chinese, QtCore.QLocale.China))
        self.centralwidget = QtWidgets.QWidget(MainWindow)
        self.centralwidget.setMinimumSize(QtCore.QSize(415, 550))
        self.centralwidget.setObjectName("centralwidget")
        self.verticalLayout = QtWidgets.QVBoxLayout(self.centralwidget)
        self.verticalLayout.setObjectName("verticalLayout")
        self.horizontalLayout = QtWidgets.QHBoxLayout()
        self.horizontalLayout.setObjectName("horizontalLayout")

        # --- left column: caption + one check box per newspaper ----------------
        self.verticalLayout_2 = QtWidgets.QVBoxLayout()
        self.verticalLayout_2.setObjectName("verticalLayout_2")
        self.label_2 = QtWidgets.QLabel(self.centralwidget)
        self.label_2.setAlignment(QtCore.Qt.AlignCenter)
        self.label_2.setObjectName("label_2")
        self.verticalLayout_2.addWidget(self.label_2, 0, QtCore.Qt.AlignHCenter | QtCore.Qt.AlignVCenter)
        spacerItem = QtWidgets.QSpacerItem(20, 40, QtWidgets.QSizePolicy.Minimum, QtWidgets.QSizePolicy.Expanding)
        self.verticalLayout_2.addItem(spacerItem)
        self.checkBox = QtWidgets.QCheckBox(self.centralwidget)
        self.checkBox.setChecked(True)
        self.checkBox.setTristate(False)
        self.checkBox.setObjectName("checkBox")
        self.verticalLayout_2.addWidget(self.checkBox)
        spacerItem1 = QtWidgets.QSpacerItem(20, 40, QtWidgets.QSizePolicy.Minimum, QtWidgets.QSizePolicy.Expanding)
        self.verticalLayout_2.addItem(spacerItem1)
        self.checkBox_3 = QtWidgets.QCheckBox(self.centralwidget)
        self.checkBox_3.setChecked(True)
        self.checkBox_3.setObjectName("checkBox_3")
        self.verticalLayout_2.addWidget(self.checkBox_3)
        spacerItem2 = QtWidgets.QSpacerItem(20, 40, QtWidgets.QSizePolicy.Minimum, QtWidgets.QSizePolicy.Expanding)
        self.verticalLayout_2.addItem(spacerItem2)
        self.checkBox_2 = QtWidgets.QCheckBox(self.centralwidget)
        self.checkBox_2.setChecked(True)
        self.checkBox_2.setObjectName("checkBox_2")
        self.verticalLayout_2.addWidget(self.checkBox_2)
        spacerItem3 = QtWidgets.QSpacerItem(20, 40, QtWidgets.QSizePolicy.Minimum, QtWidgets.QSizePolicy.Expanding)
        self.verticalLayout_2.addItem(spacerItem3)
        self.horizontalLayout.addLayout(self.verticalLayout_2)

        # --- right column: caption + calendar ----------------------------------
        self.verticalLayout_4 = QtWidgets.QVBoxLayout()
        self.verticalLayout_4.setObjectName("verticalLayout_4")
        self.label = QtWidgets.QLabel(self.centralwidget)
        self.label.setObjectName("label")
        self.verticalLayout_4.addWidget(self.label, 0, QtCore.Qt.AlignHCenter | QtCore.Qt.AlignVCenter)
        self.calendarWidget = QtWidgets.QCalendarWidget(self.centralwidget)
        self.calendarWidget.setMinimumSize(QtCore.QSize(314, 244))
        self.calendarWidget.setGridVisible(True)
        self.calendarWidget.setSelectionMode(QtWidgets.QCalendarWidget.SingleSelection)
        self.calendarWidget.setHorizontalHeaderFormat(QtWidgets.QCalendarWidget.ShortDayNames)
        self.calendarWidget.setVerticalHeaderFormat(QtWidgets.QCalendarWidget.NoVerticalHeader)
        self.calendarWidget.setDateEditEnabled(False)
        self.calendarWidget.setObjectName("calendarWidget")
        self.verticalLayout_4.addWidget(self.calendarWidget)
        self.horizontalLayout.addLayout(self.verticalLayout_4)
        self.verticalLayout.addLayout(self.horizontalLayout)

        # --- button row ----------------------------------------------------------
        self.horizontalLayout_2 = QtWidgets.QHBoxLayout()
        self.horizontalLayout_2.setObjectName("horizontalLayout_2")
        self.pushButton = QtWidgets.QPushButton(self.centralwidget)
        self.pushButton.setObjectName("pushButton")
        self.horizontalLayout_2.addWidget(self.pushButton)
        spacerItem4 = QtWidgets.QSpacerItem(40, 20, QtWidgets.QSizePolicy.Expanding, QtWidgets.QSizePolicy.Minimum)
        self.horizontalLayout_2.addItem(spacerItem4)
        self.pushButton_2 = QtWidgets.QPushButton(self.centralwidget)
        self.pushButton_2.setObjectName("pushButton_2")
        self.horizontalLayout_2.addWidget(self.pushButton_2)
        self.verticalLayout.addLayout(self.horizontalLayout_2)

        # --- text box at the bottom ----------------------------------------------
        self.horizontalLayout_5 = QtWidgets.QHBoxLayout()
        self.horizontalLayout_5.setObjectName("horizontalLayout_5")
        self.textEdit = QtWidgets.QTextEdit(self.centralwidget)
        self.textEdit.setEnabled(True)
        self.textEdit.setMinimumSize(QtCore.QSize(395, 230))
        self.textEdit.setObjectName("textEdit")
        self.horizontalLayout_5.addWidget(self.textEdit)
        self.verticalLayout.addLayout(self.horizontalLayout_5)

        MainWindow.setCentralWidget(self.centralwidget)
        self.retranslateUi(MainWindow)
        QtCore.QMetaObject.connectSlotsByName(MainWindow)

    def retranslateUi(self, MainWindow):
        """Set the window title and the captions of all labelled widgets."""
        _translate = QtCore.QCoreApplication.translate
        MainWindow.setWindowTitle(_translate("MainWindow", "报纸下载器"))
        self.label_2.setText(_translate("MainWindow", "报纸种类"))
        self.checkBox.setText(_translate("MainWindow", "人民日报"))
        self.checkBox_3.setText(_translate("MainWindow", "江西日报"))
        self.checkBox_2.setText(_translate("MainWindow", "赣南日报"))
        self.label.setText(_translate("MainWindow", "选择日期"))
        self.pushButton.setText(_translate("MainWindow", "开始下载"))
        self.pushButton_2.setText(_translate("MainWindow", "查看下载"))


def resource_path(relative):
    """Return the location of the resource file *relative*.

    If ``sys._MEIPASS`` exists the resource is looked up below that directory.
    Otherwise it is resolved against the directory that contains this module
    rather than the current working directory, so the file is found no matter
    where the program is started from.

    Args:
        relative: File name or relative path of the resource.

    Returns:
        The joined path as a string. An absolute *relative* is returned
        unchanged (``os.path.join`` semantics).
    """
    base_dir = getattr(sys, "_MEIPASS", None)
    if base_dir is None:
        base_dir = os.path.dirname(os.path.abspath(__file__))
    return os.path.join(base_dir, relative)
# Part 2: Logic / 1. RMRB download
from get_page import get_page, make_dir, header, merge_pdf, handle_string
from lxml import etree
import os
import requests
import time

# Seconds to wait for the PDF server before giving up on one request.
REQUEST_TIMEOUT = 30


def download(url, date, year, month, day, date_path, paper_name):
    """Download every page of one issue as PDF, then merge them.

    Args:
        url: Address of the issue's first page, which lists all pages.
        date: Issue date as ``YYYY-MM-DD``; used in messages and for merging.
        year, month, day: The three components of *date* as strings.
        date_path: Directory that receives the per-page PDF files.
        paper_name: Newspaper name used in messages and output paths.

    Pages whose PDF already exists in *date_path* are not downloaded again.
    A page that cannot be fetched is reported and skipped.
    """
    response = get_page(url)
    if not response:
        # get_page() yields None for a non-200 answer; nothing to parse then.
        print(f'{paper_name}({date}) 版面目录获取失败')
        return

    selector = etree.HTML(response)
    # NOTE(review): '[@]' is not a valid XPath predicate. The attribute test
    # appears to have been lost when this code was posted; restore it from the
    # page markup before running.
    chapter_urls = selector.xpath('//div[@]/a/@href')
    chapter_names = selector.xpath('//div[@]/a/text()')
    print(f'{paper_name}({date}) 共有 {len(chapter_urls)} 个版面')

    # The page number is the 1-based position in the list. Using enumerate
    # avoids a list.index() lookup per page, which is quadratic and returns
    # the wrong position when two links share the same href.
    for page_no, (chapter_name, _) in enumerate(zip(chapter_names, chapter_urls), start=1):
        chapter_name = handle_string(chapter_name)
        pdf_path = f'{date_path}/{chapter_name}.pdf'
        if os.path.exists(pdf_path):
            continue

        chapter_url = f'http://paper.people.com.cn/rmrb/html/{year}-{month}/{day}/nbs.D110000renmrb_{page_no:02d}.htm'
        chapter_response = get_page(chapter_url)
        if not chapter_response:
            print(f'{paper_name}({date}) {chapter_name} 页面获取失败,已跳过')
            continue

        # NOTE(review): same stripped '[@]' predicate as above.
        pdf_links = etree.HTML(chapter_response).xpath('//p[@]/a/@href')
        if not pdf_links:
            print(f'{paper_name}({date}) {chapter_name} 未找到PDF下载链接,已跳过')
            continue

        # Only the last five path components of the href are kept, e.g.
        # http://paper.people.com.cn/rmrb/images/2023-11/13/01/rmrb2023111301.pdf
        pdf_url = 'http://paper.people.com.cn/rmrb/' + '/'.join(pdf_links[0].split('/')[-5:])
        pdf_response = requests.get(pdf_url, headers=header, timeout=REQUEST_TIMEOUT)
        # .get() avoids a KeyError when the header is missing; startswith()
        # tolerates parameters appended to the media type.
        if pdf_response.headers.get('Content-Type', '').startswith('application/pdf'):
            with open(pdf_path, 'wb') as f:
                f.write(pdf_response.content)
            print(f'{paper_name}({date}) {chapter_name} 保存完毕!!!')
        else:
            print(f'{paper_name}({date}) {chapter_name} 下载失败,已跳过')
        time.sleep(1)  # pause between downloads

    merge_pdf(date, date_path, paper_name)


def main(paper_name, date):
    """Download the issue of *paper_name* published on *date* (``YYYY-MM-DD``)."""
    year, month, day = date.split('-')
    date_path = f'{paper_name}/{date}'
    make_dir(date_path)
    url = f'http://paper.people.com.cn/rmrb/html/{year}-{month}/{day}/nbs.D110000renmrb_01.htm'
    download(url, date, year, month, day, date_path, paper_name)


if __name__ == '__main__':
    paper_name = '人民日报'
    date = '2023-12-12'
    main(paper_name, date)
# Part 2: Logic / 2. JXRB download
import requests
import os
import time
from lxml import etree
from get_page import get_page, make_dir, header, merge_pdf, handle_string

# Seconds to wait for the PDF server before giving up on one request.
REQUEST_TIMEOUT = 30


def download(url, date, year, month, day, date_path, paper_name):
    """Download every page of one issue as PDF, then merge them.

    Args:
        url: Address of the issue's index page, which lists all pages.
        date: Issue date as ``YYYY-MM-DD``; used in messages and for merging.
        year, month, day: The three components of *date* as strings.
        date_path: Directory that receives the per-page PDF files.
        paper_name: Newspaper name used in messages and output paths.

    Pages whose PDF already exists in *date_path* are not downloaded again.
    A page that cannot be fetched is reported and skipped.
    """
    response = get_page(url)
    if not response:
        # get_page() yields None for a non-200 answer; nothing to parse then.
        print(f'{paper_name}({date}) 版面目录获取失败')
        return

    selector = etree.HTML(response)
    # NOTE(review): '[@]' is not a valid XPath predicate. The attribute test
    # appears to have been lost when this code was posted; restore it from the
    # page markup before running.
    chapter_urls = selector.xpath('//td[@]/a/@href')
    chapter_names = selector.xpath('//td[@]/a/text()')
    print(f'{paper_name}({date}) 共有 {len(chapter_urls)} 个版面')

    for chapter_name, chapter_href in zip(chapter_names, chapter_urls):
        chapter_name = handle_string(chapter_name)
        pdf_path = f'{date_path}/{chapter_name}.pdf'
        if os.path.exists(pdf_path):
            continue

        chapter_url = f'http://epaper.jxxw.com.cn/html/{year}-{month}/{day}/{chapter_href}'
        chapter_response = get_page(chapter_url)
        if not chapter_response:
            print(f'{paper_name}({date}) {chapter_name} 页面获取失败,已跳过')
            continue

        pdf_links = etree.HTML(chapter_response).xpath('//a[@id="bigbmshowpdf"]/@href')
        if not pdf_links:
            print(f'{paper_name}({date}) {chapter_name} 未找到PDF下载链接,已跳过')
            continue

        # Only the last three path components of the href are kept and
        # appended to the site's resfile/ directory.
        pdf_url = 'http://epaper.jxxw.com.cn/resfile/' + '/'.join(pdf_links[0].split('/')[-3:])
        pdf_response = requests.get(pdf_url, headers=header, timeout=REQUEST_TIMEOUT)
        # .get() avoids a KeyError when the header is missing; startswith()
        # tolerates parameters appended to the media type.
        if pdf_response.headers.get('Content-Type', '').startswith('application/pdf'):
            with open(pdf_path, 'wb') as f:
                f.write(pdf_response.content)
            print(f'{paper_name}({date}) {chapter_name} 保存完毕!!!')
        else:
            print(f'{paper_name}({date}) {chapter_name} 下载失败,已跳过')
        time.sleep(1)  # pause between downloads

    merge_pdf(date, date_path, paper_name)


def main(paper_name, date):
    """Download the issue of *paper_name* published on *date* (``YYYY-MM-DD``)."""
    year, month, day = date.split('-')
    date_path = f'{paper_name}/{date}'
    make_dir(date_path)
    url = f'http://epaper.jxxw.com.cn/html/{year}-{month}/{day}/index_{date}.htm'
    download(url, date, year, month, day, date_path, paper_name)


if __name__ == '__main__':
    paper_name = '江西日报'
    date = '2023-12-12'
    main(paper_name, date)
# Part 2: Logic / 3. GNRB download
import requests
import os
import time
from lxml import etree
from get_page import get_page, make_dir, header, merge_pdf, handle_string

# Seconds to wait for the PDF server before giving up on one request.
REQUEST_TIMEOUT = 30


def download(url, date, year, month, day, date_path, paper_name):
    """Download every page of one issue as PDF, then merge them.

    Args:
        url: Address of the issue's index page, which lists all pages.
        date: Issue date as ``YYYY-MM-DD``; used in messages and for merging.
        year, month, day: The three components of *date* as strings.
        date_path: Directory that receives the per-page PDF files.
        paper_name: Newspaper name used in messages and output paths.

    Pages whose PDF already exists in *date_path* are reported and not
    downloaded again. A page that cannot be fetched is reported and skipped.
    """
    response = get_page(url)
    if not response:
        # get_page() yields None for a non-200 answer; nothing to parse then.
        print(f'{paper_name}({date}) 版面目录获取失败')
        return

    selector = etree.HTML(response)
    # NOTE(review): '[@]' is not a valid XPath predicate. The attribute test
    # appears to have been lost when this code was posted; restore it from the
    # page markup before running.
    chapter_urls = selector.xpath('//td[@]/a/@href')
    chapter_names = selector.xpath('//td[@]/a/text()')
    print(f'{paper_name}({date}) 共有 {len(chapter_urls)} 个版面')

    for chapter_name, chapter_href in zip(chapter_names, chapter_urls):
        chapter_name = handle_string(chapter_name)
        pdf_path = f'{date_path}/{chapter_name}.pdf'
        if os.path.exists(pdf_path):
            print(f'{paper_name}({date}) {chapter_name} 保存完毕~~~')
            continue

        # e.g. https://szb.gnrbs.cn/html/2023-12/16/node_95762.htm
        chapter_url = f'https://szb.gnrbs.cn/html/{year}-{month}/{day}/{chapter_href}'
        chapter_response = get_page(chapter_url)
        if not chapter_response:
            print(f'{paper_name}({date}) {chapter_name} 页面获取失败,已跳过')
            continue

        pdf_links = etree.HTML(chapter_response).xpath('//a[@id="bigbmshowpdf"]/@href')
        if not pdf_links:
            print(f'{paper_name}({date}) {chapter_name} 未找到PDF下载链接,已跳过')
            continue

        # Only the last three path components of the href are kept, e.g.
        # https://szb.gnrbs.cn/resfile/2023-12-16/01/gnrb-20231216-001.pdf
        pdf_url = 'https://szb.gnrbs.cn/resfile/' + '/'.join(pdf_links[0].split('/')[-3:])
        pdf_response = requests.get(pdf_url, headers=header, timeout=REQUEST_TIMEOUT)
        # .get() avoids a KeyError when the header is missing; startswith()
        # tolerates parameters appended to the media type.
        if pdf_response.headers.get('Content-Type', '').startswith('application/pdf'):
            with open(pdf_path, 'wb') as f:
                f.write(pdf_response.content)
            print(f'{paper_name}({date}) {chapter_name} 保存完毕!!!')
        else:
            print(f'{paper_name}({date}) {chapter_name} 下载失败,已跳过')
        time.sleep(1)  # pause between downloads

    merge_pdf(date, date_path, paper_name)


def main(paper_name, date):
    """Download the issue of *paper_name* published on *date* (``YYYY-MM-DD``)."""
    year, month, day = date.split('-')
    date_path = f'{paper_name}/{date}'
    make_dir(date_path)
    url = f'https://szb.gnrbs.cn/html/{year}-{month}/{day}/index_{year}-{month}-{day}.htm'
    download(url, date, year, month, day, date_path, paper_name)


if __name__ == '__main__':
    # main() takes two required arguments; calling it bare raised TypeError.
    paper_name = '赣南日报'
    date = '2023-12-16'
    main(paper_name, date)
# Part 2: Logic / 4. Calling logic (GUI entry point)
import sys  # needed by MainWindow.__init__, not only by the __main__ guard

from PyQt5 import QtWidgets, QtCore, QtGui
from UI import Ui_MainWindow
from PyQt5.QtWidgets import *
from PyQt5.QtCore import *
from PyQt5.QtGui import *
import rmrb_downloader
import jxrb_downloader
import gnrb_downloader
import os

SEPARATOR = '--------分--------隔--------线--------'

# Check-box caption of each newspaper -> module whose main() downloads it.
DOWNLOADERS = {
    '人民日报': rmrb_downloader,
    '江西日报': jxrb_downloader,
    '赣南日报': gnrb_downloader,
}


class Stream(QObject):
    """File-like object that re-emits everything written to it as a signal."""

    newText = pyqtSignal(str)

    def write(self, text):
        """Emit *text* through ``newText`` and let pending events run."""
        self.newText.emit(str(text))
        QApplication.processEvents()

    def flush(self):
        """Do nothing; present because stream users may call ``flush()``."""


class Thread(QThread):
    """Worker thread that downloads the selected newspapers one by one."""

    def __init__(self, paper_selector_list=None, select_date=None, parent=None):
        super(Thread, self).__init__(parent)
        self.paper_selector_list = paper_selector_list
        self.select_date = select_date

    def __del__(self):
        self.wait()

    def run(self):
        """Call the downloader of every selected newspaper for the chosen date.

        A failure in one newspaper is printed and does not stop the
        remaining ones.
        """
        print(SEPARATOR)
        print(f'选择的报纸种类有:{self.paper_selector_list}')
        print(f'选择的下载日期是:{self.select_date}')
        for paper_name in self.paper_selector_list or []:
            downloader = DOWNLOADERS.get(paper_name)
            if downloader is None:
                continue  # no downloader registered for this caption
            self.paper_name = paper_name
            print(SEPARATOR)
            print(f'开始下载 {self.select_date} {paper_name}')
            try:
                downloader.main(paper_name, self.select_date)
            except Exception as e:
                # Thread boundary: report the error instead of killing the thread.
                print(e)
        print(SEPARATOR)


class MainWindow(QtWidgets.QMainWindow, Ui_MainWindow):
    """Main window: wires the buttons to the download thread and shows its output."""

    def __init__(self, parent=None):
        super(MainWindow, self).__init__(parent)
        self.setupUi(self)
        self.center()
        # Not named ``thread``: that would shadow QObject.thread().
        self.download_thread = None
        # Route everything printed anywhere in the program into the text box.
        sys.stdout = Stream(newText=self.onUpdateText)
        sys.stderr = Stream(newText=self.onUpdateText)
        # "Start download" button
        self.pushButton.clicked.connect(self.start_download)
        # "Show downloads" button
        self.pushButton_2.clicked.connect(self.show_download)

    def start_download(self):
        """Start a download thread for the checked newspapers and chosen date."""
        # Replacing a running QThread object would drop the only reference to
        # it while it is still working, so refuse to start a second one.
        if self.download_thread is not None and self.download_thread.isRunning():
            print('上一次下载尚未完成,请稍候……')
            return
        self.select_date = self.get_date()
        self.paper_selector_list = self.get_paper_name()
        if not self.paper_selector_list:
            print('请至少勾选一种报纸')
            return
        self.download_thread = Thread(self.paper_selector_list, self.select_date)
        self.download_thread.start()

    def show_download(self):
        """Open the current working directory in the system file browser."""
        try:
            # os.startfile only exists on Windows; elsewhere the
            # AttributeError is printed like any other failure.
            os.startfile(os.getcwd())
        except Exception as e:
            print(e)

    def get_paper_name(self):
        """Return the captions of all checked newspaper boxes, in UI order."""
        # Captions as set in retranslateUi: checkBox = 人民日报,
        # checkBox_3 = 江西日报, checkBox_2 = 赣南日报.
        boxes = (self.checkBox, self.checkBox_3, self.checkBox_2)
        return [box.text() for box in boxes if box.isChecked()]

    def get_date(self):
        """Return the date selected in the calendar as ``YYYY-MM-DD``."""
        date = QtCore.QDate(self.calendarWidget.selectedDate())
        # Month and day are zero-padded to two digits.
        return f'{date.year()}-{date.month():02d}-{date.day():02d}'

    def onUpdateText(self, text):
        """Write console output to text widget."""
        cursor = self.textEdit.textCursor()
        cursor.movePosition(QTextCursor.End)
        cursor.insertText(text)
        self.textEdit.setTextCursor(cursor)
        self.textEdit.ensureCursorVisible()

    def center(self):
        """Move the window to the centre of the available screen area."""
        qr = self.frameGeometry()
        cp = QDesktopWidget().availableGeometry().center()
        qr.moveCenter(cp)
        self.move(qr.topLeft())

    def closeEvent(self, event):
        """Ask for confirmation before the window is closed."""
        reply = QMessageBox.question(self, '退出提示', "您确定要退出吗?",
                                     QMessageBox.Yes | QMessageBox.No, QMessageBox.No)
        if reply == QMessageBox.Yes:
            event.accept()
        elif reply == QMessageBox.No:
            event.ignore()


def resource_path(relative):
    """Return the location of the resource file *relative*.

    If ``sys._MEIPASS`` exists the resource is looked up below that directory;
    otherwise it is resolved against the directory containing this module, so
    the result does not depend on the current working directory.
    """
    base_dir = getattr(sys, "_MEIPASS", None)
    if base_dir is None:
        base_dir = os.path.dirname(os.path.abspath(__file__))
    return os.path.join(base_dir, relative)


if __name__ == '__main__':
    app = QtWidgets.QApplication(sys.argv)
    mainWindow = MainWindow()
    mainWindow.show()
    sys.exit(app.exec_())
# Part 2: Logic / 5. Remaining helpers, kept together in one module
import logging
import requests
import os
from pypdf import PdfMerger, PdfReader
import shutil
import time
import re

# pypdf reports recoverable defects of the source files (for example
# "Multiple definitions in dictionary ... for key /Ascent") through the
# ``logging`` module. They do not affect the merged result, so only errors
# are let through.
logging.getLogger('pypdf').setLevel(logging.ERROR)

header = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
}


def handle_string(string):
    """Return ``str(string)`` with every CRLF and the spaces after it removed."""
    return re.sub(r'\r\n *', '', str(string))


def merge_pdf(date, date_path, paper_name):
    """Merge the page PDFs in *date_path* into one file with one bookmark each.

    The merged file is written to ``{paper_name}/{paper_name}({date}).pdf``
    and *date_path* is deleted afterwards. Files are merged in sorted name
    order, and each bookmark is titled after its file name without the
    extension.

    Args:
        date: Issue date, used in the output file name and messages.
        date_path: Directory holding the per-page PDF files.
        paper_name: Newspaper name; also the output directory.
    """
    # Anything that is not a PDF would make the reader fail, so ignore it.
    pdf_list = sorted(name for name in os.listdir(date_path) if name.lower().endswith('.pdf'))
    if not pdf_list:
        # Nothing was downloaded: do not write an empty merged file.
        print(f'{paper_name}({date}) 没有可合并的版面')
        shutil.rmtree(date_path)
        return

    # NOTE(review): newer pypdf releases replace PdfMerger with PdfWriter;
    # confirm against the installed version.
    pdf_merge = PdfMerger()
    try:
        # Page index in the merged file at which the next source file starts.
        pdf_page_num = 0
        for pdf in pdf_list:
            pdf_path = os.path.join(date_path, pdf)
            pdf_in = PdfReader(pdf_path, strict=False)
            # splitext keeps dots that belong to the title itself.
            pdf_title = os.path.splitext(pdf)[0]
            pdf_merge.append(pdf_path)
            pdf_merge.add_outline_item(pdf_title, pdf_page_num, None)
            pdf_page_num += len(pdf_in.pages)
        pdf_merge.write(f'{paper_name}/{paper_name}({date}).pdf')
    finally:
        # Close the merger even when a source file cannot be read.
        pdf_merge.close()
    print(f'{paper_name}({date}).pdf下载完成')
    time.sleep(1)
    # Remove the directory holding the per-page files.
    shutil.rmtree(date_path)


def make_dir(path):
    """Create directory *path*, including parents, unless it already exists."""
    os.makedirs(path, exist_ok=True)


def get_page(url, timeout=30):
    """Fetch *url* and return its body decoded as UTF-8.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before ``requests`` raises.

    Returns:
        The response text, or ``None`` when the status code is not 200.
    """
    response = requests.get(url, headers=header, timeout=timeout)
    if response.status_code != 200:
        return None
    response.encoding = 'utf-8'
    return response.text
三、界面展示 四、不足之处 1. 三个下载逻辑相似,可以集成到一个函数中调用。 2. 使用 pypdf 库合并 PDF 时,会出现“Multiple definitions in dictionary at byte 0x13866c for key /Ascent”的提示信息,不过不影响使用。有知道怎样解决的大佬,还望帮忙解决。
五、声明 此仅供学习研究使用,请勿用于其他用途 注:若转载请注明大神论坛来源(本贴地址)与作者信息。
|