退款明细爬虫处理

34e8dcb0 · 邱阿朋 · a8d37c4a · 34e8dcb0 · 34e8dcb0 · 34e8dcb0
Commit 34e8dcb0 authored Oct 16, 2024 by 邱阿朋
Showing with 274 additions and 2 deletions

.gitignore .gitignore +3 -1

fetch_email.py example/fetch_email.py +85 -0

relations.xlsx relations.xlsx +0 -0

requirements.txt requirements.txt +5 -1

return_reports.py return_reports.py +181 -0

No files found.
--- a/.gitignore
+++ b/.gitignore
 .idea
 .vscode
 .venv
-*.pyc
\ No newline at end of file
+*.pyc
+Return_Summary.xls
+items
\ No newline at end of file
--- a/example/fetch_email.py
+++ b/example/fetch_email.py
+import email
+import imaplib
+import os
+from email.header import decode_header
+
+# Mailbox account settings.
+# Credentials are read from the environment so that no secret is committed to
+# the repository: export IMAP_USERNAME / IMAP_PASSWORD before running.
+username = os.environ.get('IMAP_USERNAME', '')
+password = os.environ.get('IMAP_PASSWORD', '')
+# The server host may be overridden with IMAP_SERVER.
+imap_server = os.environ.get('IMAP_SERVER', 'imap.qiye.aliyun.com')
+
+
+def _decode_subject(raw_subject):
+    """Decode a raw (possibly RFC 2047 encoded) Subject header into text."""
+    if raw_subject is None:
+        # The message has no Subject header at all.
+        return ""
+
+    fragments = []
+    # A header may consist of several encoded chunks; join all of them
+    # instead of keeping only the first one.
+    for chunk, charset in decode_header(raw_subject):
+        if isinstance(chunk, bytes):
+            try:
+                chunk = chunk.decode(charset or "utf-8", errors="replace")
+            except LookupError:
+                # Unknown charset label: fall back to UTF-8.
+                chunk = chunk.decode("utf-8", errors="replace")
+        fragments.append(chunk)
+    return "".join(fragments)
+
+
+def _decode_payload(part):
+    """Return the text payload of a message part, honouring its declared charset."""
+    payload = part.get_payload(decode=True)
+    if payload is None:
+        return ""
+
+    charset = part.get_content_charset() or "utf-8"
+    try:
+        return payload.decode(charset, errors="replace")
+    except LookupError:
+        # Unknown charset label: fall back to UTF-8.
+        return payload.decode("utf-8", errors="replace")
+
+
+def get_latest_unread_email():
+    """Print subject, sender, date and body of the newest unread inbox message.
+
+    Connects to ``imap_server`` with the module-level ``username`` and
+    ``password``, searches the inbox for UNSEEN messages and prints the
+    details of the last id returned. Returns ``None`` in every case; when
+    the search or fetch fails, or there is no unread mail, a message is
+    printed instead.
+    """
+    # Connect to the IMAP server.
+    mail = imaplib.IMAP4_SSL(imap_server)
+    try:
+        mail.login(username, password)
+
+        # Select the inbox.
+        mail.select("inbox")
+
+        # Search for unread messages.
+        status, messages = mail.search(None, 'UNSEEN')
+        if status != 'OK':
+            print("没有找到未读邮件")
+            return
+
+        email_ids = messages[0].split()
+        if not email_ids:
+            print("没有未读邮件")
+            return
+
+        # The newest unread message is the last id in the list.
+        latest_email_id = email_ids[-1]
+
+        status, msg_data = mail.fetch(latest_email_id, '(RFC822)')
+        if status != 'OK':
+            print("无法获取邮件")
+            return
+
+        for response_part in msg_data:
+            # Only tuple entries carry the raw message bytes.
+            if not isinstance(response_part, tuple):
+                continue
+
+            msg = email.message_from_bytes(response_part[1])
+
+            subject = _decode_subject(msg["Subject"])
+            from_ = msg.get("From")
+            date_ = msg.get("Date")
+
+            print(f"最新未读邮件主题: {subject}")
+            print(f"发件人: {from_}")
+            print(f"发送时间: {date_}")
+
+            if not msg.is_multipart():
+                print("邮件正文:", _decode_payload(msg))
+                continue
+
+            for part in msg.walk():
+                content_type = part.get_content_type()
+                content_disposition = str(part.get("Content-Disposition"))
+
+                # Skip attachments; only inline text / HTML bodies are printed.
+                if "attachment" in content_disposition:
+                    continue
+
+                if content_type == "text/plain":
+                    print("邮件正文（纯文本）:", _decode_payload(part))
+                elif content_type == "text/html":
+                    print("邮件正文（HTML）:", _decode_payload(part))
+    finally:
+        # Always log out, including on the early returns above and on errors.
+        try:
+            mail.logout()
+        except (imaplib.IMAP4.error, OSError):
+            # The connection is already gone; nothing left to release.
+            pass
+
+
+# Fetch and print the latest unread email. This is a top-level call, so it
+# runs whenever the module is executed or imported.
+get_latest_unread_email()
--- a/relations.xlsx
+++ b/relations.xlsx
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,4 +6,8 @@ SQLAlchemy==2.0.35
 pymysql==1.1.1
 huey==2.5.1
 redis==5.0.8
-selenium==4.25.0
\ No newline at end of file
+selenium==4.25.0
+requests==2.32.3
+xlrd==2.0.1
+pandas==2.2.3
+openpyxl==3.1.5
\ No newline at end of file
--- a/return_reports.py
+++ b/return_reports.py
+# coding: utf-8
+# 导出退款记录
+import os
+import time
+
+import pandas as pd
+import xlrd
+from DrissionPage import ChromiumPage
+from DrissionPage.errors import PageDisconnectedError
+from openpyxl.reader.excel import load_workbook
+
+# Login credentials. They are assigned in the ``__main__`` block before
+# main() is called and read by open_url() when the sign-in form is present.
+email = None
+password = None
+
+
+def open_url(page, url):
+    """Load *url* in *page*, sign in if the login form appears, then wait out any captcha.
+
+    The module-level ``email`` and ``password`` are typed into the sign-in
+    form. While the captcha element is present the function keeps polling
+    once per second and prints a reminder each time.
+    """
+    page.get(url)
+
+    # The e-mail field is only present when a sign-in is required.
+    if page.ele('#ap_email', timeout=1):
+        page.ele('#ap_email').input(email)
+        page.ele('#continue').click()
+        page.ele('#ap_password').input(password)
+        page.ele('#signInSubmit').click()
+
+    # Poll until no captcha image is shown any more.
+    time.sleep(1)
+    while page.ele('.a-section a-text-center cvf-captcha-img'):
+        print("请填入图形码内容")
+        # TODO: recognise the captcha automatically
+        time.sleep(1)
+
+
+def main():
+    """Export the returns list, expand it into per-PO/ASIN rows and save the result.
+
+    For every row of the downloaded returns summary the matching detail
+    sheet is downloaded, grouped by 'Purchase order' and 'ASIN', and one
+    output row is produced per group. The rows are written to
+    '退货明细.xlsx'.
+    """
+    page = ChromiumPage()
+    page.set.load_mode.normal()  # use the "normal" load mode
+    page.set.when_download_file_exists('overwrite')
+
+    # Downloads go to the current working directory.
+    download_path = os.getcwd()
+    make_dir(download_path)
+    page.set.download_path(download_path)
+
+    # ASIN -> SKU mapping read from relations.xlsx.
+    relations_dict = asin_sku_relations()
+
+    # Download and read the returns summary.
+    list_data = export_list(page)
+
+    new_list_data = []
+    for _, data in list_data.iterrows():
+        return_id = data.get('Return ID')
+        # Download (or reuse) the detail sheet of this return.
+        item_data = export_item(page, return_id)
+        # Sum 'Quantity' and 'Total amount' per ('Purchase order', 'ASIN').
+        item_data_result = item_data.groupby(['Purchase order', 'ASIN'], as_index=False).agg({
+            'Quantity': 'sum',
+            'Total amount': 'sum',
+        })
+
+        for _, item_row in item_data_result.iterrows():
+            asin = item_row.get('ASIN')
+            # An ASIN missing from relations.xlsx must not abort the whole
+            # export: warn and leave the SKU empty instead of raising KeyError.
+            sku = relations_dict.get(asin)
+            if sku is None:
+                print(f"未找到 ASIN {asin} 对应的 SKU")
+                sku = ''
+
+            data_dict = data.to_dict()
+            data_dict['Return Date'] = data_dict['Return Date'].strftime('%m/%d/%Y')
+            data_dict['Return ID'] = str(data_dict['Return ID'])
+            data_dict['PO'] = item_row.get('Purchase order')
+            data_dict['ASIN'] = asin
+            data_dict['SKU'] = sku
+            data_dict['Quantity'] = item_row.get('Quantity')
+
+            # Replace the summary quantity and cost with the detail values.
+            data_dict['Return quantity'] = item_row.get('Quantity')
+            data_dict['Total cost'] = item_row.get('Total amount')
+
+            new_list_data.append(data_dict)
+
+    save_xls(new_list_data, '退货明细.xlsx')
+
+
+def asin_sku_relations():
+    """Read relations.xlsx and return a dict mapping each 'ASIN' value to its 'SKU' value."""
+    sheet = pd.read_excel('relations.xlsx')
+    records = (row.to_dict() for _, row in sheet.iterrows())
+    # Later rows overwrite earlier ones when an ASIN appears more than once.
+    return {record['ASIN']: record['SKU'] for record in records}
+
+
+def export_list(page):
+    """Download the returns summary via *page* and return it as a DataFrame."""
+    returns_url = "https://vendorcentral.amazon.com/hz/vendor/members/returns?ref_=vc_xx_subNav"
+    open_url(page, returns_url)
+
+    # Click the export button and block until the download has finished.
+    download = page.ele("#file-download-button").click.to_download()
+    download.wait()
+
+    summary = pd.read_excel('Return_Summary.xls', engine='xlrd')
+    return summary
+
+
+def export_item(page, return_id):
+    """Return the detail sheet of *return_id* as a DataFrame, downloading it if needed.
+
+    The sheet is stored as ``items/<return_id>.xls``; an existing file is
+    reused instead of being downloaded again.
+    """
+    items_dir = "items"
+    make_dir(items_dir)
+
+    # Build the path with os.path.join so the separator is correct on every
+    # platform (a literal backslash is only a separator on Windows).
+    file_name = os.path.join(items_dir, f"{return_id}.xls")
+    if not os.path.isfile(file_name):
+        # Open the return detail page and download its item sheet.
+        open_url(page, f"https://vendorcentral.amazon.com/katalmonsapp/vendor/members/returns/{return_id}")
+        mission = page.ele("#file-download-button").click.to_download(rename=file_name)
+        mission.wait()
+
+    # Read the returned item details.
+    return pd.read_excel(file_name, engine='xlrd')
+
+
+def open_xls(file_path):
+    """Open the workbook at *file_path* with xlrd and return its first sheet."""
+    return xlrd.open_workbook(filename=file_path).sheet_by_index(0)
+
+
+def save_xls(data, output_file):
+    """Write *data* to *output_file* as an Excel sheet and widen columns to fit.
+
+    *data* is anything ``pd.DataFrame`` accepts (here a list of dicts). The
+    file is written without the row index, reopened with openpyxl, and each
+    column width is set to its longest cell text plus two characters.
+    """
+    df = pd.DataFrame(data)
+    df.to_excel(output_file, index=False)  # index=False: do not write the row index
+
+    # Reload with openpyxl to adjust the layout.
+    wb = load_workbook(output_file)
+    ws = wb.active
+
+    for column in ws.columns:
+        column_letter = column[0].column_letter
+        # Empty cells count as zero width rather than as the text "None".
+        max_length = max(
+            (0 if cell.value is None else len(str(cell.value)) for cell in column),
+            default=0,
+        )
+        # Add a little padding for readability.
+        ws.column_dimensions[column_letter].width = max_length + 2
+
+    wb.save(output_file)
+
+
+def make_dir(path):
+    """Ensure *path* exists; return True if it already did, False if it was just created."""
+    already_there = os.path.exists(path)
+    if not already_there:
+        os.makedirs(path)
+    return already_there
+
+
+def get_input_with_default(prompt, default):
+    """Ask the user for a value and return it, or *default* when the answer is empty."""
+    answer = input(f"{prompt}（默认为 '{default}'）：")
+    if answer:
+        return answer
+    return default
+
+
+if __name__ == '__main__':
+    # Local import: only the interactive entry point needs getpass.
+    import getpass
+
+    try:
+        email = get_input_with_default("请输入账户", "us-cs001@khdtek.com")
+        print(f"您输入的账户是：{email}")
+        # Read the password without echoing it and never print it back.
+        # No password is kept in the source; an empty answer falls back to
+        # the VC_PASSWORD environment variable (empty when unset).
+        password = getpass.getpass("请输入密码：") or os.environ.get("VC_PASSWORD", "")
+
+        main()
+    except KeyboardInterrupt:
+        # Ctrl+C: exit quietly.
+        pass
+    except PageDisconnectedError:
+        print("与页面的连接已断开")