Commit 8d5adcae authored by 邱阿朋

爬虫处理

parent 34e8dcb0
......@@ -3,4 +3,7 @@
.venv
*.pyc
Return_Summary.xls
items
\ No newline at end of file
ContraCogsInvoices.xls
returns
invoices
coop
\ No newline at end of file
# coding: utf-8
# 回款明细
import os
import pandas as pd
from DrissionPage import ChromiumPage
from DrissionPage.errors import PageDisconnectedError
from helper import helper
# Module-level browser setup used by main(): a single ChromiumPage instance,
# switched to the "normal" load mode and told to overwrite a download whose
# file name already exists.
page = ChromiumPage()
page.set.load_mode.normal()
page.set.when_download_file_exists('overwrite')
# Download directory: the current working directory.
download_path = os.getcwd()
# Create the download directory if it does not exist yet.
helper.make_dir(download_path)
# Point the page's downloads at that directory.
# NOTE(review): the original comment asks for this to be set before the
# browser is opened, yet ChromiumPage() has already been constructed above -
# confirm the ordering is fine.
page.set.download_path(download_path)
def main():
    """Download the co-op invoice summary, then one detail report per invoice.

    Steps:
      1. Open the co-op page, select every row and trigger the report
         download from the actions dropdown.
      2. Read the downloaded ``ContraCogsInvoices.xls`` with pandas.
      3. For each row, search the co-op page by its "Invoice ID" and download
         that invoice's report as ``coop\\<invoice id>.csv``.

    Relies on the module-level ``page`` (ChromiumPage) and ``helper`` objects.
    """
    page.get("https://vendorcentral.amazon.com/hz/vendor/members/coop?ref_=vc_xx_subNav")
    # Select all rows.
    page.ele("#select-all").click()
    # Open the actions dropdown.
    page.ele("#cc-invoice-actions-dropdown").click()
    # Click the report-download entry and block until the file is on disk.
    mission = page.ele("#cc-invoice-actions-dropdown_2").click.to_download()
    mission.wait()

    summary_file = "ContraCogsInvoices.xls"
    coop_data = pd.read_excel(summary_file, engine='xlrd')

    # Make sure the target directory for the per-invoice files exists.
    coop_dir = "coop"
    helper.make_dir(coop_dir)

    for _, data in coop_data.iterrows():
        # Search by invoice id, then download that invoice's report.
        invoice_id = data.get("Invoice ID")
        page.get(f"https://vendorcentral.amazon.com/hz/vendor/members/coop?searchText={invoice_id}")
        # Open the per-invoice dropdown.
        page.ele("#a-autoid-2-announce").click()
        # Download the report under the invoice's own name.
        file_name = f"{coop_dir}\\{invoice_id}.csv"
        mission = page.ele(f"#invoiceDownloads-{invoice_id}_1").click.to_download(rename=file_name)
        # Wait for this download before navigating away to the next invoice,
        # mirroring how the summary download above is handled.
        mission.wait()
if __name__ == '__main__':
    # Script entry point: run the crawl and end quietly on the two expected
    # interruptions.
    try:
        main()
    except KeyboardInterrupt:
        # Ctrl+C: stop without a traceback.
        pass
    except PageDisconnectedError:
        # The connection to the browser page was lost.
        print("与页面的连接已断开")
# coding: utf-8
import os
import pandas as pd
import xlrd
from openpyxl.reader.excel import load_workbook
def open_xls(file_path):
    """Open the workbook at *file_path* with xlrd and return its first sheet."""
    book = xlrd.open_workbook(filename=file_path)
    # Sheet index 0 is the first worksheet of the workbook.
    first_sheet = book.sheet_by_index(0)
    return first_sheet
def save_xls(data, output_file):
    """Write *data* to an Excel file and size every column to fit its content.

    Args:
        data: Any value accepted by the ``pandas.DataFrame`` constructor.
        output_file: Path of the workbook to create; it is written once by
            pandas and then re-saved after the column widths are adjusted.
    """
    df = pd.DataFrame(data)
    # index=False keeps the DataFrame's row index out of the sheet.
    df.to_excel(output_file, index=False)

    # Reload the file with openpyxl to adjust the column dimensions.
    wb = load_workbook(output_file)
    ws = wb.active  # the active worksheet

    for column in ws.columns:
        column_letter = column[0].column_letter
        # Longest rendered cell value in this column. Empty cells are skipped
        # so they no longer count as the four characters of "None".
        max_length = max(
            (len(str(cell.value)) for cell in column if cell.value is not None),
            default=0,
        )
        # Two extra characters of padding for readability.
        ws.column_dimensions[column_letter].width = max_length + 2

    # Persist the adjusted workbook.
    wb.save(output_file)
def make_dir(path):
    """Ensure *path* exists, creating it (and missing parents) when absent.

    Args:
        path: Directory path to create.

    Returns:
        bool: ``True`` if the path already existed, ``False`` if this call
        created it.

    Raises:
        OSError: If the directory cannot be created and the path still does
            not exist afterwards.
    """
    try:
        # Create first instead of checking first: this closes the window
        # between an existence check and the creation.
        os.makedirs(path)
    except OSError:
        # Treat the failure as "already there" only when the path really
        # exists; any other failure is propagated unchanged.
        if not os.path.exists(path):
            raise
        return True
    return False
def get_input_with_default(prompt, default):
    """Ask the user for a value and fall back to *default* on an empty reply.

    Args:
        prompt: Text shown to the user; the default is appended to it.
        default: Value returned when the user enters nothing.

    Returns:
        The entered text, or *default* when the entered text is empty.
    """
    answer = input(f"{prompt}(默认为 '{default}'):")
    if answer:
        return answer
    return default
# coding: utf-8
# 回款明细
import os
import pandas as pd
from DrissionPage import ChromiumPage
from DrissionPage.errors import PageDisconnectedError
from helper import helper
# Module-level browser setup used by main(): a single ChromiumPage instance,
# switched to the "normal" load mode and told to overwrite a download whose
# file name already exists.
page = ChromiumPage()
page.set.load_mode.normal()
page.set.when_download_file_exists('overwrite')
# Download directory: the current working directory.
download_path = os.getcwd()
# Create the download directory if it does not exist yet.
helper.make_dir(download_path)
# Point the page's downloads at that directory.
# NOTE(review): the original comment asks for this to be set before the
# browser is opened, yet ChromiumPage() has already been constructed above -
# confirm the ordering is fine.
page.set.download_path(download_path)
def main(po="74HDTI2S"):
    """Download the line-item export of one invoice and print every row.

    Args:
        po: Invoice number placed in the ``invoiceNumber`` query parameter and
            used as the CSV file name. Defaults to the value that used to be
            hard-coded, so calling ``main()`` behaves as before.

    Relies on the module-level ``page`` (ChromiumPage) and ``helper`` objects.
    Any error raised while downloading or reading the CSV is reported on
    stdout instead of being propagated.
    """
    page.get(
        f"https://vendorcentral.amazon.com/hz/vendor/members/inv-mgmt/invoice-details?invoiceNumber={po}&payeeCode=VECET&activeTab=lineItems")
    try:
        invoice_dir = "invoices"
        # Make sure the target directory exists before the download is renamed
        # into it.
        helper.make_dir(invoice_dir)
        file_name = f"{invoice_dir}\\{po}.csv"
        mission = page.ele("#line-items-export-to-spreadsheet-announce").click.to_download(rename=file_name)
        mission.wait()
        # The first two lines of the export are skipped.
        # NOTE(review): presumably a preamble above the header row - confirm.
        detail_data = pd.read_csv(file_name, skiprows=2)
        for _, data in detail_data.iterrows():
            print(f"{data.to_dict()}")
    except Exception as err:
        # Best-effort script: report the failure and return normally.
        print(f"读取 CSV 文件时出错: {err}")
if __name__ == '__main__':
    # Script entry point: run the download and end quietly on the two expected
    # interruptions.
    try:
        main()
    except KeyboardInterrupt:
        # Ctrl+C: stop without a traceback.
        pass
    except PageDisconnectedError:
        # The connection to the browser page was lost.
        print("与页面的连接已断开")
......@@ -4,16 +4,27 @@ import os
import time
import pandas as pd
import xlrd
from DrissionPage import ChromiumPage
from DrissionPage.errors import PageDisconnectedError
from openpyxl.reader.excel import load_workbook
from helper import helper
# Account credentials; assigned from user input in the __main__ guard.
# NOTE(review): where they are consumed is not visible in this excerpt.
email = None
password = None
# Module-level browser setup shared by the functions below.
page = ChromiumPage()
page.set.load_mode.normal() # use the "normal" load mode
# Overwrite a download whose file name already exists.
page.set.when_download_file_exists('overwrite')
# Download directory: the current working directory.
download_path = os.getcwd()
# Create the download directory if it does not exist yet.
helper.make_dir(download_path)
# Point the page's downloads at that directory.
# NOTE(review): the original comment asks for this to be set before the
# browser is opened, yet ChromiumPage() has already been constructed above -
# confirm the ordering is fine.
page.set.download_path(download_path)
def open_url(page, url):
def open_url(url):
# 访问网页
page.get(url)
......@@ -36,28 +47,17 @@ def open_url(page, url):
def main():
page = ChromiumPage()
page.set.load_mode.normal() # 设置为normal模式
page.set.when_download_file_exists('overwrite')
# 下载目录
download_path = os.getcwd()
# 检查下载目录是否存在,如果不存在则创建
make_dir(download_path)
# 设置下载路径,确保在打开浏览器前设置
page.set.download_path(download_path)
# 读取asin和sku映射关系
relations_dict = asin_sku_relations()
# 下载并读取list数据
list_data = export_list(page)
list_data = export_list()
new_list_data = []
for _, data in list_data.iterrows():
return_id = data.get('Return ID')
# 下载退货详情表格读取数据
item_data = export_item(page, return_id)
item_data = export_item(return_id)
# 按 'Purchase order' 和 'ASIN' 分组,并对 'Quantity' 和 Total amount 进行求和
item_data_result = item_data.groupby(['Purchase order', 'ASIN'], as_index=False).agg({
'Quantity': 'sum',
......@@ -80,7 +80,7 @@ def main():
# 追加数据
new_list_data.append(data_dict)
save_xls(new_list_data, '退货明细.xlsx')
helper.save_xls(new_list_data, '退货明细.xlsx')
def asin_sku_relations():
......@@ -94,9 +94,9 @@ def asin_sku_relations():
return relations_dict
def export_list(page):
def export_list():
# 访问网页
open_url(page, "https://vendorcentral.amazon.com/hz/vendor/members/returns?ref_=vc_xx_subNav")
open_url("https://vendorcentral.amazon.com/hz/vendor/members/returns?ref_=vc_xx_subNav")
# 导出退货单
mission = page.ele("#file-download-button").click.to_download()
......@@ -104,14 +104,14 @@ def export_list(page):
return pd.read_excel('Return_Summary.xls', engine='xlrd')
def export_item(page, return_id):
items_dir = "items"
make_dir(items_dir)
def export_item(return_id):
returns_dir = "returns"
helper.make_dir(returns_dir)
file_name = f"{items_dir}\\{return_id}.xls"
file_name = f"{returns_dir}\\{return_id}.xls"
if not os.path.isfile(file_name):
# 打开退回详情下载明细
open_url(page, f"https://vendorcentral.amazon.com/katalmonsapp/vendor/members/returns/{return_id}")
open_url(f"https://vendorcentral.amazon.com/katalmonsapp/vendor/members/returns/{return_id}")
mission = page.ele("#file-download-button").click.to_download(rename=file_name)
mission.wait()
......@@ -119,60 +119,12 @@ def export_item(page, return_id):
return pd.read_excel(file_name, engine='xlrd')
def open_xls(file_path):
    """Open the workbook at *file_path* with xlrd and return its first sheet."""
    book = xlrd.open_workbook(filename=file_path)
    # Sheet index 0 is the first worksheet of the workbook.
    first_sheet = book.sheet_by_index(0)
    return first_sheet
def save_xls(data, output_file):
    """Write *data* to an Excel file and size every column to fit its content.

    Args:
        data: Any value accepted by the ``pandas.DataFrame`` constructor.
        output_file: Path of the workbook to create; it is written once by
            pandas and then re-saved after the column widths are adjusted.
    """
    df = pd.DataFrame(data)
    # index=False keeps the DataFrame's row index out of the sheet.
    df.to_excel(output_file, index=False)

    # Reload the file with openpyxl to adjust the column dimensions.
    wb = load_workbook(output_file)
    ws = wb.active  # the active worksheet

    for column in ws.columns:
        column_letter = column[0].column_letter
        # Longest rendered cell value in this column. Empty cells are skipped
        # so they no longer count as the four characters of "None".
        max_length = max(
            (len(str(cell.value)) for cell in column if cell.value is not None),
            default=0,
        )
        # Two extra characters of padding for readability.
        ws.column_dimensions[column_letter].width = max_length + 2

    # Persist the adjusted workbook.
    wb.save(output_file)
def make_dir(path):
    """Ensure *path* exists, creating it (and missing parents) when absent.

    Args:
        path: Directory path to create.

    Returns:
        bool: ``True`` if the path already existed, ``False`` if this call
        created it.

    Raises:
        OSError: If the directory cannot be created and the path still does
            not exist afterwards.
    """
    try:
        # Create first instead of checking first: this closes the window
        # between an existence check and the creation.
        os.makedirs(path)
    except OSError:
        # Treat the failure as "already there" only when the path really
        # exists; any other failure is propagated unchanged.
        if not os.path.exists(path):
            raise
        return True
    return False
def get_input_with_default(prompt, default):
    """Ask the user for a value and fall back to *default* on an empty reply.

    Args:
        prompt: Text shown to the user; the default is appended to it.
        default: Value returned when the user enters nothing.

    Returns:
        The entered text, or *default* when the entered text is empty.
    """
    answer = input(f"{prompt}(默认为 '{default}'):")
    if answer:
        return answer
    return default
if __name__ == '__main__':
try:
email = get_input_with_default("请输入账户", "us-cs001@khdtek.com")
print(f"您输入的账户是{email}")
password = get_input_with_default("请输入密码", "khd=20221208")
print(f"您输入的账户是:{password}")
email = helper.get_input_with_default("请输入账户", "us-cs001@khdtek.com")
print(f"您输入的账户是:{email}")
password = helper.get_input_with_default("请输入密码", "khd=20221208")
print(f"您输入的密码是:{password}")
main()
except KeyboardInterrupt:
......
......@@ -2,7 +2,7 @@
from huey import RedisHuey
import bootstrap
from models import Store, Goods
from models.models import Store, Goods
bootstrap.init()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment