Commit 34e8dcb0 authored by 邱阿朋

退款明细爬虫处理

parent a8d37c4a
.idea
.vscode
.venv
*.pyc
\ No newline at end of file
*.pyc
Return_Summary.xls
items
\ No newline at end of file
import email
import imaplib
import os
from email.header import decode_header
# Mailbox account settings. Each value can be overridden through an
# environment variable so credentials do not have to be edited in source.
# NOTE(review): the fallback literals are real-looking credentials kept only
# for backward compatibility - rotate them and remove the defaults.
username = os.environ.get('IMAP_USERNAME', 'us-cs001@khdtek.com')  # mailbox address
password = os.environ.get('IMAP_PASSWORD', 'khd=20221208')  # mailbox password
imap_server = os.environ.get('IMAP_SERVER', 'imap.qiye.aliyun.com')
def _decode_bytes(raw, charset):
    """Decode ``raw`` bytes with ``charset``, falling back to UTF-8.

    Undecodable bytes are replaced instead of raising, and an unknown
    charset name falls back to UTF-8 as well.
    """
    try:
        return raw.decode(charset or "utf-8", errors="replace")
    except LookupError:
        return raw.decode("utf-8", errors="replace")


def _decode_subject(raw_subject):
    """Return the Subject header as text; a missing header yields ''."""
    if raw_subject is None:
        return ""
    fragments = []
    # A header may consist of several encoded words; decode and join them all.
    for text, charset in decode_header(raw_subject):
        if isinstance(text, bytes):
            text = _decode_bytes(text, charset)
        fragments.append(text)
    return "".join(fragments)


def _decode_body(part):
    """Return the decoded payload of ``part`` using its declared charset."""
    payload = part.get_payload(decode=True)
    if payload is None:
        return ""
    return _decode_bytes(payload, part.get_content_charset())


def _print_message(msg):
    """Print subject, sender, date and text/HTML bodies of ``msg``."""
    subject = _decode_subject(msg["Subject"])
    from_ = msg.get("From")
    date_ = msg.get("Date")
    print(f"最新未读邮件主题: {subject}")
    print(f"发件人: {from_}")
    print(f"发送时间: {date_}")

    if not msg.is_multipart():
        print("邮件正文:", _decode_body(msg))
        return

    for part in msg.walk():
        content_type = part.get_content_type()
        content_disposition = str(part.get("Content-Disposition"))
        # Only inline text or HTML parts are printed; attachments are skipped.
        if "attachment" in content_disposition:
            continue
        if content_type == "text/plain":
            print("邮件正文(纯文本):", _decode_body(part))
        elif content_type == "text/html":
            print("邮件正文(HTML):", _decode_body(part))


def get_latest_unread_email():
    """Print the last unread message found in the inbox.

    Connects to ``imap_server`` over SSL, logs in with the module-level
    ``username`` and ``password``, searches the inbox for ``UNSEEN`` messages
    and fetches the last ID returned by the search. Its subject, sender, date
    and body are printed. Always returns ``None``.

    The connection is used as a context manager so that it is logged out on
    every exit path, including the early returns and unexpected errors.
    """
    with imaplib.IMAP4_SSL(imap_server) as mail:
        mail.login(username, password)
        mail.select("inbox")

        status, messages = mail.search(None, 'UNSEEN')
        if status != 'OK':
            print("没有找到未读邮件")
            return

        email_ids = messages[0].split()
        if not email_ids:
            print("没有未读邮件")
            return

        # The last ID in the search result is treated as the latest message.
        status, msg_data = mail.fetch(email_ids[-1], '(RFC822)')
        if status != 'OK':
            print("无法获取邮件")
            return

        for response_part in msg_data:
            # Only tuple items carry message bytes (in their second element).
            if isinstance(response_part, tuple):
                _print_message(email.message_from_bytes(response_part[1]))
# Fetch the latest unread mail only when this file is run as a script, so
# that importing the module does not open a network connection.
if __name__ == '__main__':
    get_latest_unread_email()
File added
......@@ -6,4 +6,8 @@ SQLAlchemy==2.0.35
pymysql==1.1.1
huey==2.5.1
redis==5.0.8
selenium==4.25.0
\ No newline at end of file
selenium==4.25.0
requests==2.32.3
xlrd==2.0.1
pandas==2.2.3
openpyxl==3.1.5
\ No newline at end of file
# coding: utf-8
# 导出退款记录
import os
import time
import pandas as pd
import xlrd
from DrissionPage import ChromiumPage
from DrissionPage.errors import PageDisconnectedError
from openpyxl.reader.excel import load_workbook
# Login credentials read by open_url(); they start as None and are assigned
# in the __main__ block before main() is called.
email = None
password = None
def open_url(page, url):
    """Load ``url`` in ``page`` and sign in when the login form is shown.

    The sign-in form is detected by probing for the ``#ap_email`` field with
    a one second timeout. When it is present, the module-level ``email`` and
    ``password`` are submitted, then the function polls once per second until
    the captcha image element is no longer found, prompting the operator on
    every poll where it is still present.

    :param page: browser page object providing ``get`` and ``ele``.
    :param url: address to open.
    """
    page.get(url)

    # No e-mail field means no sign-in form was shown: nothing more to do.
    if not page.ele('#ap_email', timeout=1):
        return

    page.ele('#ap_email').input(email)
    page.ele('#continue').click()
    page.ele('#ap_password').input(password)
    page.ele('#signInSubmit').click()

    # Wait for the captcha (if any) to disappear.
    captcha_present = True
    while captcha_present:
        time.sleep(1)
        captcha_present = bool(page.ele('.a-section a-text-center cvf-captcha-img'))
        if captcha_present:
            print("请填入图形码内容")
            # TODO: recognise the captcha automatically
def main():
    """Export the return list, expand it per PO/ASIN and save it to Excel.

    Steps:
      1. Configure the browser page (load mode, overwrite policy, download
         directory set to the current working directory).
      2. Load the ASIN -> SKU mapping and download the return summary.
      3. For every return, download its item sheet, sum ``Quantity`` and
         ``Total amount`` per (``Purchase order``, ``ASIN``) and emit one
         output row per group.
      4. Write all rows to ``退货明细.xlsx``.

    An ASIN missing from the mapping no longer aborts the run with
    ``KeyError``; a warning is printed and the SKU is left empty.
    """
    page = ChromiumPage()
    page.set.load_mode.normal()
    page.set.when_download_file_exists('overwrite')

    # The download directory must be configured before any download starts.
    download_path = os.getcwd()
    make_dir(download_path)
    page.set.download_path(download_path)

    relations_dict = asin_sku_relations()
    list_data = export_list(page)

    new_list_data = []
    for _, data in list_data.iterrows():
        return_id = data.get('Return ID')
        item_data = export_item(page, return_id)

        # Sum quantity and amount per purchase order and ASIN.
        item_data_result = item_data.groupby(['Purchase order', 'ASIN'], as_index=False).agg({
            'Quantity': 'sum',
            'Total amount': 'sum',
        })
        if item_data_result.empty:
            continue

        # The summary part of the row is identical for every item of this
        # return, so it is converted once instead of once per item.
        base_row = data.to_dict()
        base_row['Return Date'] = base_row['Return Date'].strftime('%m/%d/%Y')
        base_row['Return ID'] = str(base_row['Return ID'])

        for _, item_row in item_data_result.iterrows():
            asin = item_row.get('ASIN')
            if asin not in relations_dict:
                print(f"未找到 ASIN {asin} 对应的 SKU")

            data_dict = dict(base_row)
            data_dict['PO'] = item_row.get('Purchase order')
            data_dict['ASIN'] = asin
            data_dict['SKU'] = relations_dict.get(asin)
            data_dict['Quantity'] = item_row.get('Quantity')
            # Replace the summary quantity and amount with the item values.
            data_dict['Return quantity'] = item_row.get('Quantity')
            data_dict['Total cost'] = item_row.get('Total amount')
            new_list_data.append(data_dict)

    save_xls(new_list_data, '退货明细.xlsx')
def asin_sku_relations():
    """Load the ASIN -> SKU mapping from ``relations.xlsx``.

    :return: dict keyed by the ``ASIN`` column with the ``SKU`` column as
        value; when an ASIN occurs more than once the last row wins.
    """
    mapping_frame = pd.read_excel('relations.xlsx')
    records = (row.to_dict() for _, row in mapping_frame.iterrows())
    return {record['ASIN']: record['SKU'] for record in records}
def export_list(page):
    """Download the return summary and load it as a DataFrame.

    Opens the returns page, clicks the download button, waits for the
    download mission to finish and reads ``Return_Summary.xls`` with xlrd.

    :param page: browser page object used for navigation and download.
    :return: DataFrame read from ``Return_Summary.xls``.
    """
    returns_url = "https://vendorcentral.amazon.com/hz/vendor/members/returns?ref_=vc_xx_subNav"
    open_url(page, returns_url)

    # Trigger the export and block until the download has completed.
    download_button = page.ele("#file-download-button")
    download = download_button.click.to_download()
    download.wait()

    summary = pd.read_excel('Return_Summary.xls', engine='xlrd')
    return summary
def export_item(page, return_id):
    """Download (if not cached) and load the item sheet of one return.

    The sheet is stored as ``items/<return_id>.xls``. When that file already
    exists the download is skipped and the cached file is read.

    :param page: browser page object used for navigation and download.
    :param return_id: identifier of the return; used in the URL and file name.
    :return: DataFrame read from the item sheet.
    """
    items_dir = "items"
    make_dir(items_dir)

    # Build the path with os.path.join instead of a hard-coded backslash so
    # the separator matches the platform (unchanged on Windows).
    file_name = os.path.join(items_dir, f"{return_id}.xls")
    if not os.path.isfile(file_name):
        # Open the return detail page and download its item sheet.
        open_url(page, f"https://vendorcentral.amazon.com/katalmonsapp/vendor/members/returns/{return_id}")
        mission = page.ele("#file-download-button").click.to_download(rename=file_name)
        mission.wait()

    return pd.read_excel(file_name, engine='xlrd')
def open_xls(file_path):
    """Open the workbook at ``file_path`` with xlrd and return its first sheet.

    :param file_path: path of the Excel file to open.
    :return: the worksheet at index 0.
    """
    first_sheet = xlrd.open_workbook(filename=file_path).sheet_by_index(0)
    return first_sheet
def save_xls(data, output_file):
    """Write ``data`` to an Excel file and widen columns to fit their content.

    :param data: anything accepted by ``pd.DataFrame`` (main() passes a list
        of row dicts).
    :param output_file: path of the workbook to create.

    The workbook is written without the row index, reopened with openpyxl,
    and every column is set to the length of its longest cell text plus two
    characters of padding. Empty cells are ignored when measuring.
    """
    df = pd.DataFrame(data)
    df.to_excel(output_file, index=False)

    # Reopen the file with openpyxl to adjust the column widths.
    wb = load_workbook(output_file)
    ws = wb.active
    for column in ws.columns:
        column_letter = column[0].column_letter
        max_length = max(
            (len(str(cell.value)) for cell in column if cell.value is not None),
            default=0,
        )
        # A little extra width keeps the text from touching the cell border.
        ws.column_dimensions[column_letter].width = max_length + 2

    wb.save(output_file)
def make_dir(path):
    """Ensure ``path`` exists, creating it (with parents) when it is missing.

    :param path: directory path to check.
    :return: ``True`` when the path already existed, ``False`` when it had
        to be created.
    """
    if os.path.exists(path):
        return True
    os.makedirs(path)
    return False
def get_input_with_default(prompt, default):
    """Ask the user for a value, falling back to ``default`` on empty input.

    :param prompt: text shown to the user.
    :param default: value returned when nothing is entered.
    :return: the entered text, or ``default`` when the input is empty.
    """
    answer = input(f"{prompt}(默认为 '{default}'):")
    if answer:
        return answer
    return default
if __name__ == '__main__':
    try:
        # NOTE(review): the defaults below are real-looking credentials kept
        # for backward compatibility - rotate them and move them out of source.
        email = get_input_with_default("请输入账户", "us-cs001@khdtek.com")
        print(f"您输入的账户是:{email}")
        password = get_input_with_default("请输入密码", "khd=20221208")
        # The password is not echoed back; the previous message printed it in
        # plain text under the label meant for the account.
        print("密码已输入")
        main()
    except KeyboardInterrupt:
        # Ctrl+C is treated as a normal way to stop the script.
        pass
    except PageDisconnectedError:
        print("与页面的连接已断开")
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment