Commit ccca3f9a authored by 邱阿朋

爬虫处理

parent 82a37d66
......@@ -4,6 +4,7 @@
*.pyc
Return_Summary.xls
ContraCogsInvoices.xls
Payments.xlsx
returns
invoices
coop
\ No newline at end of file
......@@ -3,16 +3,12 @@
import os
import pandas as pd
from DrissionPage import ChromiumPage, ChromiumOptions
from DrissionPage import ChromiumPage
from DrissionPage.errors import PageDisconnectedError
from helper import helper
# Create the options object (settings are read from the ini file by default)
co = ChromiumOptions()
# Disable image loading and mute audio
co.no_imgs(True).mute(True)
# NOTE(review): `page` is assigned twice in a row here; this looks like the
# removed and the added line of the diff hunk shown together — confirm which
# one the real file keeps.
page = ChromiumPage(addr_or_opts=co)
page = ChromiumPage()
page.set.load_mode.eager()
# Overwrite an existing file of the same name when a download collides with it
page.set.when_download_file_exists('overwrite')
......@@ -31,8 +27,8 @@ def main():
# 点击选项卡
page.ele("#cc-invoice-actions-dropdown").click()
# 点击下载报表
mission = page.ele("#cc-invoice-actions-dropdown_2").click.to_download()
mission.wait()
page.ele("#cc-invoice-actions-dropdown_2").click.to_download()
page.download.wait()
file_name = "ContraCogsInvoices.xls"
coop_data = pd.read_excel(file_name, engine='xlrd')
......
# coding: utf-8
import pandas as pd
import xlrd
from openpyxl.reader.excel import load_workbook
def open_xls(file_path):
    """Open the workbook at ``file_path`` with xlrd and return its first sheet.

    :param file_path: path of the Excel file to open
    :return: the sheet at index 0 of the workbook
    """
    return xlrd.open_workbook(filename=file_path).sheet_by_index(0)
def save_xls(data, output_file):
    """Write ``data`` to an Excel file and size every column to fit its content.

    The data is converted to a ``pandas.DataFrame`` and written without the
    row index. The file is then reopened with openpyxl so that each column of
    the active sheet can be widened to its longest cell plus a small margin.

    :param data: anything accepted by ``pandas.DataFrame`` (e.g. a list of dicts)
    :param output_file: path of the workbook to create or overwrite
    """
    # index=False keeps the DataFrame row index out of the sheet.
    pd.DataFrame(data).to_excel(output_file, index=False)

    # Reload with openpyxl to adjust the column widths of the written sheet.
    wb = load_workbook(output_file)
    ws = wb.active
    for column in ws.columns:
        # Empty cells hold None; count them as zero width instead of as the
        # 4-character text "None".
        longest = max(
            (len(str(cell.value)) for cell in column if cell.value is not None),
            default=0,
        )
        # Two extra characters of padding so the text is not flush to the edge.
        ws.column_dimensions[column[0].column_letter].width = longest + 2
    wb.save(output_file)
# coding: utf-8
import os
import time
def wait_for_downloads(download_dir, timeout=60,
                       partial_suffixes=(".crdownload", ".part", ".tmp")):
    """
    Poll a download directory until it contains a finished file.

    Any entry already present in the directory counts, so callers that need
    to detect a *new* file should start from an empty directory. Entries whose
    name ends with one of ``partial_suffixes`` are treated as downloads still
    in progress and are ignored.

    :param download_dir: directory the browser downloads into
    :param timeout: maximum time to wait, in seconds
    :param partial_suffixes: file-name suffixes of unfinished downloads
    :return: True once a finished entry is present, False on timeout
    """
    suffixes = tuple(partial_suffixes)
    # A monotonic clock keeps the timeout correct if the system time changes.
    deadline = time.monotonic() + timeout
    while True:
        # Check before looking at the deadline so the directory is inspected
        # at least once, even when timeout is 0.
        if any(not name.endswith(suffixes) for name in os.listdir(download_dir)):
            return True
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            return False
        # Poll roughly once per second without overshooting the deadline.
        time.sleep(min(1, remaining))
def make_dir(path):
    """Ensure ``path`` exists, creating it (and any missing parents) if needed.

    The directory is created directly and an "already exists" error is
    handled, instead of checking first, so a concurrent creation between the
    check and the call cannot make this function fail.

    :param path: directory path to create
    :return: False when the directory had to be created, True when the path
        already existed
    """
    try:
        os.makedirs(path)
    except FileExistsError:
        return True
    return False
# coding: utf-8
import os
import pandas as pd
import xlrd
from openpyxl.reader.excel import load_workbook
def open_xls(file_path):
    """Load an Excel workbook through xlrd and hand back its first worksheet.

    :param file_path: path of the Excel file to open
    :return: the sheet at index 0 of the workbook
    """
    return xlrd.open_workbook(filename=file_path).sheet_by_index(0)
def save_xls(data, output_file):
    """Write ``data`` to an Excel file and size every column to fit its content.

    The data is converted to a ``pandas.DataFrame`` and written without the
    row index. The file is then reopened with openpyxl so that each column of
    the active sheet can be widened to its longest cell plus a small margin.

    :param data: anything accepted by ``pandas.DataFrame`` (e.g. a list of dicts)
    :param output_file: path of the workbook to create or overwrite
    """
    # index=False keeps the DataFrame row index out of the sheet.
    pd.DataFrame(data).to_excel(output_file, index=False)

    # Reload with openpyxl to adjust the column widths of the written sheet.
    wb = load_workbook(output_file)
    ws = wb.active
    for column in ws.columns:
        # Empty cells hold None; count them as zero width instead of as the
        # 4-character text "None".
        longest = max(
            (len(str(cell.value)) for cell in column if cell.value is not None),
            default=0,
        )
        # Two extra characters of padding so the text is not flush to the edge.
        ws.column_dimensions[column[0].column_letter].width = longest + 2
    wb.save(output_file)
def make_dir(path):
# 检查下载目录是否存在,如果不存在则创建
......
# coding: utf-8
# 回款明细
import os
import time
import urllib.parse
import warnings
import pandas as pd
from DrissionPage import ChromiumPage
......@@ -18,23 +21,67 @@ helper.make_dir(download_path)
# Set the download path; make sure this is done before the browser is opened
page.set.download_path(download_path)
# Suppress UserWarning messages coming from openpyxl (e.g. its style warnings)
warnings.filterwarnings("ignore", category=UserWarning, module="openpyxl")
# Further processing of the Description column of the filtered data.
def process_description(description):
    """Return the first 8 characters of the text after the last ``/``.

    :param description: Description cell value; non-string values (such as
        the NaN of an empty cell) are treated as missing
    :return: the 8-character prefix of the last ``/``-separated segment, or
        None when that segment is shorter than 8 characters or the input is
        not a string
    """
    if not isinstance(description, str):
        return None
    # rpartition yields the whole string when it contains no '/'.
    last_segment = description.rpartition('/')[2]
    if len(last_segment) >= 8:
        return last_segment[:8]
    return None
def download_filter_data():
    """Load the payments export and keep only the rows of interest.

    When ``Payments.xlsx`` is not present in the working directory, the
    remittance page is opened, every row is selected and the export is
    downloaded first. The spreadsheet is then read, skipping its first 22
    rows, and filtered on the ``Description`` column.

    :return: DataFrame of the rows whose Description matches the pattern
    """
    file_name = 'Payments.xlsx'
    payments_missing = not os.path.isfile(file_name)
    if payments_missing:
        page.get("https://vendorcentral.amazon.com/hz/vendor/members/remittance/home")
        page.ele("#remittance-home-select-all").click()
        page.ele("#remittance-home-export-link").click.to_download()
        page.download.wait()

    payments = pd.read_excel(file_name, skiprows=22)
    # Matches descriptions containing 'Price', 'PCR', 'Missed' or 'Shortage',
    # or starting with an 'XXXXXXXX/XXXX/' style prefix.
    pattern = r'Price|PCR|Missed|Shortage|^[A-Z0-9]{8}/[A-Z0-9]{4}/'
    wanted = payments['Description'].str.contains(pattern, na=False, regex=True)
    return payments[wanted]
# Entry point: walks the rows returned by download_filter_data() and, per row,
# opens invoice pages and exports/reads the invoice line items as CSV.
# NOTE(review): this span appears to interleave the removed and the added
# lines of a diff hunk (the hard-coded `po` request and the `invoice_number`
# request both appear, and one URL string is left without its call) —
# confirm statement order and nesting against the real file.
def main():
po = "74HDTI2S"
list_data = download_filter_data()
for _, data in list_data.iterrows():
invoice_number = data.get("Invoice Number")
description = data.get("Description")
if "Price" in description or "PCR" in description or "Missed" in description or "Shortage" in description:
# Keep only the first 8 characters
invoice_number = invoice_number[:8]
print(invoice_number)
page.get(
f"https://vendorcentral.amazon.com/hz/vendor/members/inv-mgmt/invoice-details?invoiceNumber={po}&payeeCode=VECET&activeTab=lineItems")
try:
invoice_dir = "invoices"
file_name = f"{invoice_dir}\\{po}.csv"
mission = page.ele("#line-items-export-to-spreadsheet-announce").click.to_download(rename=file_name)
mission.wait()
f"https://vendorcentral.amazon.com/hz/vendor/members/inv-mgmt/invoice-po-search?searchByNumberToken={invoice_number}")
params = {
"invoiceNumber": invoice_number,
"payeeCode": "VECET",
"activeTab": "lineItems",
}
# Encode the dict as a URL query string
query_string = urllib.parse.urlencode(params)
page.get(f"https://vendorcentral.amazon.com/hz/vendor/members/inv-mgmt/invoice-details?" + query_string)
detail_data = pd.read_csv(file_name, skiprows=2)
for _, data, in detail_data.iterrows():
print(f"{data.to_dict()}")
# Read the detail content
file_name = f"invoices\\{invoice_number}.csv"
page.ele("#line-items-export-to-spreadsheet-announce").click.to_download(rename=file_name)
time.sleep(3)
except Exception as err:
print(f"读取 CSV 文件时出错: {err}")
detail_data = pd.read_csv(file_name, skiprows=2, engine='python', on_bad_lines='skip')
# for _, detail, in detail_data.iterrows():
# print(f"{detail.to_dict()}")
if __name__ == '__main__':
......
......@@ -4,19 +4,15 @@ import os
import time
import pandas as pd
from DrissionPage import ChromiumPage, ChromiumOptions
from DrissionPage import ChromiumPage
from DrissionPage.errors import PageDisconnectedError
from helper import helper
from helper import helper, excel
# Login credentials; both start out unset
email = None
password = None
# Create the options object (settings are read from the ini file by default)
co = ChromiumOptions()
# Disable image loading and mute audio
co.no_imgs(True).mute(True)
# NOTE(review): `page` is assigned twice in a row here; this looks like the
# removed and the added line of the diff hunk shown together — confirm which
# one the real file keeps.
page = ChromiumPage(addr_or_opts=co)
page = ChromiumPage()
page.set.load_mode.eager()
# Overwrite an existing file of the same name when a download collides with it
page.set.when_download_file_exists('overwrite')
......@@ -84,7 +80,8 @@ def main():
# 追加数据
new_list_data.append(data_dict)
helper.save_xls(new_list_data, '退货明细.xlsx')
excel.save_xls(new_list_data, '退货明细.xlsx')
page.close()
def asin_sku_relations():
......@@ -103,8 +100,8 @@ def export_list():
open_url("https://vendorcentral.amazon.com/hz/vendor/members/returns?ref_=vc_xx_subNav")
# 导出退货单
mission = page.ele("#file-download-button").click.to_download()
mission.wait()
page.ele("#file-download-button").click.to_download()
page.download.wait()
return pd.read_excel('Return_Summary.xls', engine='xlrd')
......@@ -116,8 +113,8 @@ def export_item(return_id):
if not os.path.isfile(file_name):
# 打开退回详情下载明细
open_url(f"https://vendorcentral.amazon.com/katalmonsapp/vendor/members/returns/{return_id}")
mission = page.ele("#file-download-button").click.to_download(rename=file_name)
mission.wait()
page.ele("#file-download-button").click.to_download(rename=file_name)
page.download.wait()
# 读取回退商品详情
return pd.read_excel(file_name, engine='xlrd')
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment