Commit 8d5adcae authored by 邱阿朋

爬虫处理

parent 34e8dcb0
...@@ -3,4 +3,7 @@ ...@@ -3,4 +3,7 @@
.venv .venv
*.pyc *.pyc
Return_Summary.xls Return_Summary.xls
items ContraCogsInvoices.xls
\ No newline at end of file returns
invoices
coop
\ No newline at end of file
# coding: utf-8
# 回款明细
import os
import pandas as pd
from DrissionPage import ChromiumPage
from DrissionPage.errors import PageDisconnectedError
from helper import helper
# Module-level browser page shared by the functions in this script.
page = ChromiumPage()
# Use the 'normal' page load mode.
page.set.load_mode.normal()
# When a downloaded file already exists under the same name, overwrite it.
page.set.when_download_file_exists('overwrite')
# Download directory: the current working directory.
download_path = os.getcwd()
# Check whether the download directory exists and create it if it does not.
helper.make_dir(download_path)
# Set the download path (original note: make sure this is set before the browser is opened).
page.set.download_path(download_path)
def main():
    """Download the co-op invoice summary and then one detail CSV per invoice.

    Steps:
      1. Open the co-op page, select all rows and download the summary
         workbook through the actions dropdown.
      2. Read ``ContraCogsInvoices.xls`` from the download directory.
      3. For every row, search for its "Invoice ID" and download the detail
         report to ``coop\\<invoice id>.csv``.

    Relies on the module-level ``page`` object and on ``helper.make_dir``.
    """
    page.get("https://vendorcentral.amazon.com/hz/vendor/members/coop?ref_=vc_xx_subNav")
    # Select all invoices.
    page.ele("#select-all").click()
    # Open the actions dropdown.
    page.ele("#cc-invoice-actions-dropdown").click()
    # Click "download report" and wait until the summary file is complete.
    mission = page.ele("#cc-invoice-actions-dropdown_2").click.to_download()
    mission.wait()

    coop_data = pd.read_excel("ContraCogsInvoices.xls", engine='xlrd')

    # Make sure the target directory for the per-invoice files exists,
    # the same way the returns script prepares its own directory.
    coop_dir = "coop"
    helper.make_dir(coop_dir)

    for _, data in coop_data.iterrows():
        # Search by invoice id and download its detail report.
        invoice_id = data.get("Invoice ID")
        if pd.isna(invoice_id):
            # Row without an invoice id: there is nothing to search for.
            continue
        page.get(f"https://vendorcentral.amazon.com/hz/vendor/members/coop?searchText={invoice_id}")
        # Open the per-invoice dropdown.
        page.ele("#a-autoid-2-announce").click()
        # Download the report.
        file_name = f"{coop_dir}\\{invoice_id}.csv"
        mission = page.ele(f"#invoiceDownloads-{invoice_id}_1").click.to_download(rename=file_name)
        # Wait for this download before navigating away for the next invoice;
        # otherwise the next page.get() can race with the unfinished download.
        mission.wait()
if __name__ == '__main__':
    # Script entry point: run the crawl, report a lost browser connection,
    # and exit quietly when the user interrupts with Ctrl+C.
    try:
        main()
    except PageDisconnectedError:
        print("与页面的连接已断开")
    except KeyboardInterrupt:
        pass
# coding: utf-8
import os
import pandas as pd
import xlrd
from openpyxl.reader.excel import load_workbook
def open_xls(file_path):
    """Open the workbook at ``file_path`` with xlrd and return its first sheet.

    Args:
        file_path: Path of the workbook to open.

    Returns:
        The sheet at index 0 of the opened workbook.
    """
    book = xlrd.open_workbook(filename=file_path)
    first_sheet = book.sheet_by_index(0)
    return first_sheet
def save_xls(data, output_file):
    """Write ``data`` to an Excel file and widen each column to fit its content.

    Args:
        data: Anything accepted by ``pd.DataFrame(...)`` (the caller in this
            project passes a list of dicts).
        output_file: Path of the workbook to create.

    The file is written once by pandas (without the row index), reloaded with
    openpyxl to set the column widths, and saved again in place.
    """
    pd.DataFrame(data).to_excel(output_file, index=False)

    # Reload with openpyxl so that column dimensions can be adjusted.
    wb = load_workbook(output_file)
    ws = wb.active

    for column in ws.columns:
        column_letter = column[0].column_letter
        # Longest rendered cell text in this column (header included).
        # str() of a cell value does not need a blanket try/except; the
        # previous bare ``except: pass`` could only hide real problems.
        max_length = max((len(str(cell.value)) for cell in column), default=0)
        # Add a little padding so the text is not flush against the border.
        ws.column_dimensions[column_letter].width = max_length + 2

    wb.save(output_file)
def make_dir(path):
    """Ensure that ``path`` exists, creating it (and parents) when missing.

    Args:
        path: Directory path to create.

    Returns:
        ``False`` when the directory had to be created, ``True`` when the
        path already existed.

    Raises:
        OSError: If the directory cannot be created for a reason other than
            already existing (e.g. an empty path or missing permissions).
    """
    # Try to create first instead of "check, then create": this avoids the
    # race where the path appears between the existence check and makedirs.
    try:
        os.makedirs(path)
    except FileExistsError:
        return True
    return False
def get_input_with_default(prompt, default):
    """Prompt the user and fall back to ``default`` on empty input.

    Args:
        prompt: Text shown to the user; the default value is appended to it.
        default: Value returned when the user enters nothing.

    Returns:
        The entered text, or ``default`` when the entered text is empty.
    """
    answer = input(f"{prompt}(默认为 '{default}'):")
    if answer:
        return answer
    return default
# coding: utf-8
# 回款明细
import os
import pandas as pd
from DrissionPage import ChromiumPage
from DrissionPage.errors import PageDisconnectedError
from helper import helper
# Module-level browser page shared by the functions in this script.
page = ChromiumPage()
# Use the 'normal' page load mode.
page.set.load_mode.normal()
# When a downloaded file already exists under the same name, overwrite it.
page.set.when_download_file_exists('overwrite')
# Download directory: the current working directory.
download_path = os.getcwd()
# Check whether the download directory exists and create it if it does not.
helper.make_dir(download_path)
# Set the download path (original note: make sure this is set before the browser is opened).
page.set.download_path(download_path)
def main(invoice_number="74HDTI2S"):
    """Export the line items of one invoice to CSV and print every row.

    Args:
        invoice_number: Value placed in the ``invoiceNumber`` query parameter
            and used as the CSV file name. Defaults to the value that was
            previously hard-coded, so calling ``main()`` behaves as before.

    Relies on the module-level ``page`` object and on ``helper.make_dir``.
    Failures while downloading or reading the CSV are reported on stdout;
    a lost browser connection is re-raised so the caller can handle it.
    """
    page.get(
        f"https://vendorcentral.amazon.com/hz/vendor/members/inv-mgmt/invoice-details?invoiceNumber={invoice_number}&payeeCode=VECET&activeTab=lineItems")
    try:
        invoice_dir = "invoices"
        # Make sure the target directory exists before downloading into it,
        # the same way the returns script prepares its own directory.
        helper.make_dir(invoice_dir)
        file_name = f"{invoice_dir}\\{invoice_number}.csv"
        mission = page.ele("#line-items-export-to-spreadsheet-announce").click.to_download(rename=file_name)
        mission.wait()
        # The first two lines of the export are skipped before the header row.
        detail_data = pd.read_csv(file_name, skiprows=2)
        for _, data in detail_data.iterrows():
            print(f"{data.to_dict()}")
    except PageDisconnectedError:
        # Let the entry point report a lost connection instead of
        # mislabelling it as a CSV problem below.
        raise
    except Exception as err:
        print(f"读取 CSV 文件时出错: {err}")
if __name__ == '__main__':
    # Script entry point: run the export, report a lost browser connection,
    # and exit quietly when the user interrupts with Ctrl+C.
    try:
        main()
    except PageDisconnectedError:
        print("与页面的连接已断开")
    except KeyboardInterrupt:
        pass
...@@ -4,16 +4,27 @@ import os ...@@ -4,16 +4,27 @@ import os
import time import time
import pandas as pd import pandas as pd
import xlrd
from DrissionPage import ChromiumPage from DrissionPage import ChromiumPage
from DrissionPage.errors import PageDisconnectedError from DrissionPage.errors import PageDisconnectedError
from openpyxl.reader.excel import load_workbook
from helper import helper
email = None email = None
password = None password = None
page = ChromiumPage()
page.set.load_mode.normal() # 设置为normal模式
page.set.when_download_file_exists('overwrite')
# 下载目录
download_path = os.getcwd()
# 检查下载目录是否存在,如果不存在则创建
helper.make_dir(download_path)
# 设置下载路径,确保在打开浏览器前设置
page.set.download_path(download_path)
def open_url(page, url):
def open_url(url):
# 访问网页 # 访问网页
page.get(url) page.get(url)
...@@ -36,28 +47,17 @@ def open_url(page, url): ...@@ -36,28 +47,17 @@ def open_url(page, url):
def main(): def main():
page = ChromiumPage()
page.set.load_mode.normal() # 设置为normal模式
page.set.when_download_file_exists('overwrite')
# 下载目录
download_path = os.getcwd()
# 检查下载目录是否存在,如果不存在则创建
make_dir(download_path)
# 设置下载路径,确保在打开浏览器前设置
page.set.download_path(download_path)
# 读取asin和sku映射关系 # 读取asin和sku映射关系
relations_dict = asin_sku_relations() relations_dict = asin_sku_relations()
# 下载并读取list数据 # 下载并读取list数据
list_data = export_list(page) list_data = export_list()
new_list_data = [] new_list_data = []
for _, data in list_data.iterrows(): for _, data in list_data.iterrows():
return_id = data.get('Return ID') return_id = data.get('Return ID')
# 下载退货详情表格读取数据 # 下载退货详情表格读取数据
item_data = export_item(page, return_id) item_data = export_item(return_id)
# 按 'Purchase order' 和 'ASIN' 分组,并对 'Quantity' 和 Total amount 进行求和 # 按 'Purchase order' 和 'ASIN' 分组,并对 'Quantity' 和 Total amount 进行求和
item_data_result = item_data.groupby(['Purchase order', 'ASIN'], as_index=False).agg({ item_data_result = item_data.groupby(['Purchase order', 'ASIN'], as_index=False).agg({
'Quantity': 'sum', 'Quantity': 'sum',
...@@ -80,7 +80,7 @@ def main(): ...@@ -80,7 +80,7 @@ def main():
# 追加数据 # 追加数据
new_list_data.append(data_dict) new_list_data.append(data_dict)
save_xls(new_list_data, '退货明细.xlsx') helper.save_xls(new_list_data, '退货明细.xlsx')
def asin_sku_relations(): def asin_sku_relations():
...@@ -94,9 +94,9 @@ def asin_sku_relations(): ...@@ -94,9 +94,9 @@ def asin_sku_relations():
return relations_dict return relations_dict
def export_list(page): def export_list():
# 访问网页 # 访问网页
open_url(page, "https://vendorcentral.amazon.com/hz/vendor/members/returns?ref_=vc_xx_subNav") open_url("https://vendorcentral.amazon.com/hz/vendor/members/returns?ref_=vc_xx_subNav")
# 导出退货单 # 导出退货单
mission = page.ele("#file-download-button").click.to_download() mission = page.ele("#file-download-button").click.to_download()
...@@ -104,14 +104,14 @@ def export_list(page): ...@@ -104,14 +104,14 @@ def export_list(page):
return pd.read_excel('Return_Summary.xls', engine='xlrd') return pd.read_excel('Return_Summary.xls', engine='xlrd')
def export_item(page, return_id): def export_item(return_id):
items_dir = "items" returns_dir = "returns"
make_dir(items_dir) helper.make_dir(returns_dir)
file_name = f"{items_dir}\\{return_id}.xls" file_name = f"{returns_dir}\\{return_id}.xls"
if not os.path.isfile(file_name): if not os.path.isfile(file_name):
# 打开退回详情下载明细 # 打开退回详情下载明细
open_url(page, f"https://vendorcentral.amazon.com/katalmonsapp/vendor/members/returns/{return_id}") open_url(f"https://vendorcentral.amazon.com/katalmonsapp/vendor/members/returns/{return_id}")
mission = page.ele("#file-download-button").click.to_download(rename=file_name) mission = page.ele("#file-download-button").click.to_download(rename=file_name)
mission.wait() mission.wait()
...@@ -119,60 +119,12 @@ def export_item(page, return_id): ...@@ -119,60 +119,12 @@ def export_item(page, return_id):
return pd.read_excel(file_name, engine='xlrd') return pd.read_excel(file_name, engine='xlrd')
def open_xls(file_path):
# 开始处理excel数据
workbook = xlrd.open_workbook(filename=file_path)
# 选择工作表
return workbook.sheet_by_index(0) # 选择第一个工作表
def save_xls(data, output_file):
df = pd.DataFrame(data)
# 将 DataFrame 写入 Excel 文件
df.to_excel(output_file, index=False) # index=False 表示不写入行索引
# 使用 openpyxl 重新加载工作簿
wb = load_workbook(output_file)
ws = wb.active # 获取活动工作表
# 自动调整列宽
for column in ws.columns:
max_length = 0
# 获取列字母
column_letter = column[0].column_letter
for cell in column:
try:
if len(str(cell.value)) > max_length:
max_length = len(str(cell.value))
except:
pass
# 增加一些宽度以美观
adjusted_width = (max_length + 2)
ws.column_dimensions[column_letter].width = adjusted_width
# 保存调整后的工作簿
wb.save(output_file)
def make_dir(path):
# 检查下载目录是否存在,如果不存在则创建
if not os.path.exists(path):
os.makedirs(path)
return False
return True
def get_input_with_default(prompt, default):
user_input = input(f"{prompt}(默认为 '{default}'):")
return user_input or default
if __name__ == '__main__': if __name__ == '__main__':
try: try:
email = get_input_with_default("请输入账户", "us-cs001@khdtek.com") email = helper.get_input_with_default("请输入账户", "us-cs001@khdtek.com")
print(f"您输入的账户是{email}") print(f"您输入的账户是:{email}")
password = get_input_with_default("请输入密码", "khd=20221208") password = helper.get_input_with_default("请输入密码", "khd=20221208")
print(f"您输入的账户是:{password}") print(f"您输入的密码是:{password}")
main() main()
except KeyboardInterrupt: except KeyboardInterrupt:
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
from huey import RedisHuey from huey import RedisHuey
import bootstrap import bootstrap
from models import Store, Goods from models.models import Store, Goods
bootstrap.init() bootstrap.init()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment