Commit ccca3f9a authored by 邱阿朋

爬虫处理

parent 82a37d66
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
*.pyc *.pyc
Return_Summary.xls Return_Summary.xls
ContraCogsInvoices.xls ContraCogsInvoices.xls
Payments.xlsx
returns returns
invoices invoices
coop coop
\ No newline at end of file
...@@ -3,16 +3,12 @@ ...@@ -3,16 +3,12 @@
import os import os
import pandas as pd import pandas as pd
from DrissionPage import ChromiumPage, ChromiumOptions from DrissionPage import ChromiumPage
from DrissionPage.errors import PageDisconnectedError from DrissionPage.errors import PageDisconnectedError
from helper import helper from helper import helper
# 创建配置对象(默认从 ini 文件中读取配置) page = ChromiumPage()
co = ChromiumOptions()
# 设置不加载图片、静音
co.no_imgs(True).mute(True)
page = ChromiumPage(addr_or_opts=co)
page.set.load_mode.eager() page.set.load_mode.eager()
page.set.when_download_file_exists('overwrite') page.set.when_download_file_exists('overwrite')
...@@ -31,8 +27,8 @@ def main(): ...@@ -31,8 +27,8 @@ def main():
# 点击选项卡 # 点击选项卡
page.ele("#cc-invoice-actions-dropdown").click() page.ele("#cc-invoice-actions-dropdown").click()
# 点击下载报表 # 点击下载报表
mission = page.ele("#cc-invoice-actions-dropdown_2").click.to_download() page.ele("#cc-invoice-actions-dropdown_2").click.to_download()
mission.wait() page.download.wait()
file_name = "ContraCogsInvoices.xls" file_name = "ContraCogsInvoices.xls"
coop_data = pd.read_excel(file_name, engine='xlrd') coop_data = pd.read_excel(file_name, engine='xlrd')
......
# coding: utf-8
import pandas as pd
import xlrd
from openpyxl.reader.excel import load_workbook
def open_xls(file_path):
    """Open an Excel workbook with xlrd and return its first worksheet.

    :param file_path: path of the workbook to open
    :return: the sheet at index 0 of the workbook
    """
    book = xlrd.open_workbook(filename=file_path)
    first_sheet = book.sheet_by_index(0)
    return first_sheet
def save_xls(data, output_file):
    """Write ``data`` to an Excel file and widen every column to fit its content.

    The file is first written through pandas, then re-opened with openpyxl
    so that each column width can be set to the longest cell text plus a
    small padding.

    :param data: any value accepted by ``pandas.DataFrame`` (for example a
        list of dicts, one per row)
    :param output_file: path of the Excel file to create or overwrite
    """
    frame = pd.DataFrame(data)
    # index=False keeps the DataFrame row index out of the sheet.
    frame.to_excel(output_file, index=False)

    # Re-open the freshly written file with openpyxl to adjust the layout.
    wb = load_workbook(output_file)
    ws = wb.active

    for column in ws.columns:
        column_letter = column[0].column_letter
        # Empty cells are skipped so they are not measured as the
        # 4-character text "None"; default=0 covers all-empty columns.
        max_length = max(
            (len(str(cell.value)) for cell in column if cell.value is not None),
            default=0,
        )
        # A little padding so the text does not touch the cell border.
        ws.column_dimensions[column_letter].width = max_length + 2

    wb.save(output_file)
# coding: utf-8
import os
import time
def wait_for_downloads(download_dir, timeout=60, poll_interval=1):
    """Poll a download directory until it contains at least one finished file.

    Entries whose name ends with a browser "download in progress" suffix
    (``.crdownload``, ``.part``, ``.tmp``) are ignored, so a partially
    written file is not reported as a completed download. Any other entry
    counts, including ones that were already present before the call.

    The directory is always scanned at least once, even when ``timeout``
    is zero or negative.

    :param download_dir: directory the browser downloads into
    :param timeout: maximum time to wait, in seconds
    :param poll_interval: pause between two directory scans, in seconds
    :return: True as soon as a finished entry is found, False on timeout
    """
    partial_suffixes = ('.crdownload', '.part', '.tmp')
    # monotonic() is immune to wall-clock adjustments during the wait.
    deadline = time.monotonic() + timeout
    while True:
        if any(
            not name.lower().endswith(partial_suffixes)
            for name in os.listdir(download_dir)
        ):
            return True
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            return False
        # Never sleep past the deadline.
        time.sleep(min(poll_interval, remaining))
def make_dir(path):
    """Make sure ``path`` exists, creating it (and missing parents) if needed.

    The directory is created first and the "already exists" case is handled
    afterwards, so another process creating the same directory between a
    check and the creation cannot make this function fail.

    :param path: directory path to ensure
    :return: True if the path already existed, False if this call created it
    :raises OSError: if the path does not exist and cannot be created
    """
    try:
        os.makedirs(path)
    except OSError:
        # Creation failed: that is fine when the path is already there,
        # anything else (permissions, invalid path, ...) is a real error.
        if os.path.exists(path):
            return True
        raise
    return False
# coding: utf-8 # coding: utf-8
import os import os
import pandas as pd
import xlrd
from openpyxl.reader.excel import load_workbook
def open_xls(file_path):
    """Open an Excel workbook with xlrd and return its first worksheet.

    :param file_path: path of the workbook to open
    :return: the sheet at index 0 of the workbook
    """
    book = xlrd.open_workbook(filename=file_path)
    first_sheet = book.sheet_by_index(0)
    return first_sheet
def save_xls(data, output_file):
    """Write ``data`` to an Excel file and widen every column to fit its content.

    The file is first written through pandas, then re-opened with openpyxl
    so that each column width can be set to the longest cell text plus a
    small padding.

    :param data: any value accepted by ``pandas.DataFrame`` (for example a
        list of dicts, one per row)
    :param output_file: path of the Excel file to create or overwrite
    """
    frame = pd.DataFrame(data)
    # index=False keeps the DataFrame row index out of the sheet.
    frame.to_excel(output_file, index=False)

    # Re-open the freshly written file with openpyxl to adjust the layout.
    wb = load_workbook(output_file)
    ws = wb.active

    for column in ws.columns:
        column_letter = column[0].column_letter
        # Empty cells are skipped so they are not measured as the
        # 4-character text "None"; default=0 covers all-empty columns.
        max_length = max(
            (len(str(cell.value)) for cell in column if cell.value is not None),
            default=0,
        )
        # A little padding so the text does not touch the cell border.
        ws.column_dimensions[column_letter].width = max_length + 2

    wb.save(output_file)
def make_dir(path): def make_dir(path):
# 检查下载目录是否存在,如果不存在则创建 # 检查下载目录是否存在,如果不存在则创建
......
# coding: utf-8 # coding: utf-8
# 回款明细 # 回款明细
import os import os
import time
import urllib.parse
import warnings
import pandas as pd import pandas as pd
from DrissionPage import ChromiumPage from DrissionPage import ChromiumPage
...@@ -18,23 +21,67 @@ helper.make_dir(download_path) ...@@ -18,23 +21,67 @@ helper.make_dir(download_path)
# 设置下载路径,确保在打开浏览器前设置 # 设置下载路径,确保在打开浏览器前设置
page.set.download_path(download_path) page.set.download_path(download_path)
# 忽略 openpyxl 样式警告
warnings.filterwarnings("ignore", category=UserWarning, module="openpyxl")
# 对过滤后的数据,进一步处理 Description 列
def process_description(description):
    """Return the first 8 characters of the last '/'-separated segment.

    Only the length of the last segment is checked; its characters are not
    validated in any way.

    :param description: text of a ``Description`` cell; non-string values
        (such as None or the NaN pandas uses for an empty cell) are tolerated
    :return: the 8-character prefix of the last segment, or None when that
        segment is shorter than 8 characters or the input is not a string
    """
    # Empty spreadsheet cells arrive as NaN/None rather than as strings.
    if not isinstance(description, str):
        return None
    # Text after the final '/', or the whole text when there is no '/'.
    last_segment = description.rsplit('/', 1)[-1]
    if len(last_segment) < 8:
        return None
    return last_segment[:8]
def download_filter_data():
    """Load the payments export and keep only the rows of interest.

    When ``Payments.xlsx`` is not present yet, the remittance page is
    opened, all entries are selected and the export is downloaded first.
    The sheet is then read (skipping the first 22 rows) and filtered on
    its ``Description`` column.

    :return: DataFrame with the rows whose description contains 'Price',
        'PCR', 'Missed' or 'Shortage', or starts with an
        ``XXXXXXXX/XXXX/`` style prefix
    """
    file_name = 'Payments.xlsx'
    already_downloaded = os.path.isfile(file_name)
    if not already_downloaded:
        page.get("https://vendorcentral.amazon.com/hz/vendor/members/remittance/home")
        select_all = page.ele("#remittance-home-select-all")
        select_all.click()
        export_link = page.ele("#remittance-home-export-link")
        export_link.click.to_download()
        page.download.wait()

    payments = pd.read_excel(file_name, skiprows=22)
    # Keyword anywhere in the text, or an 8-char/4-char code prefix at the start.
    pattern = r'Price|PCR|Missed|Shortage|^[A-Z0-9]{8}/[A-Z0-9]{4}/'
    # Rows with a missing description are treated as non-matching (na=False).
    wanted = payments['Description'].str.contains(pattern, na=False, regex=True)
    return payments[wanted]
def main(): def main():
po = "74HDTI2S" list_data = download_filter_data()
page.get( for _, data in list_data.iterrows():
f"https://vendorcentral.amazon.com/hz/vendor/members/inv-mgmt/invoice-details?invoiceNumber={po}&payeeCode=VECET&activeTab=lineItems") invoice_number = data.get("Invoice Number")
try: description = data.get("Description")
invoice_dir = "invoices" if "Price" in description or "PCR" in description or "Missed" in description or "Shortage" in description:
file_name = f"{invoice_dir}\\{po}.csv" # 获取前8位
mission = page.ele("#line-items-export-to-spreadsheet-announce").click.to_download(rename=file_name) invoice_number = invoice_number[:8]
mission.wait()
print(invoice_number)
page.get(
f"https://vendorcentral.amazon.com/hz/vendor/members/inv-mgmt/invoice-po-search?searchByNumberToken={invoice_number}")
params = {
"invoiceNumber": invoice_number,
"payeeCode": "VECET",
"activeTab": "lineItems",
}
# 将字典转换为 URL 查询参数
query_string = urllib.parse.urlencode(params)
page.get(f"https://vendorcentral.amazon.com/hz/vendor/members/inv-mgmt/invoice-details?" + query_string)
detail_data = pd.read_csv(file_name, skiprows=2) # 读取详情内容
for _, data, in detail_data.iterrows(): file_name = f"invoices\\{invoice_number}.csv"
print(f"{data.to_dict()}") page.ele("#line-items-export-to-spreadsheet-announce").click.to_download(rename=file_name)
time.sleep(3)
except Exception as err: detail_data = pd.read_csv(file_name, skiprows=2, engine='python', on_bad_lines='skip')
print(f"读取 CSV 文件时出错: {err}") # for _, detail, in detail_data.iterrows():
# print(f"{detail.to_dict()}")
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -4,19 +4,15 @@ import os ...@@ -4,19 +4,15 @@ import os
import time import time
import pandas as pd import pandas as pd
from DrissionPage import ChromiumPage, ChromiumOptions from DrissionPage import ChromiumPage
from DrissionPage.errors import PageDisconnectedError from DrissionPage.errors import PageDisconnectedError
from helper import helper from helper import helper, excel
email = None email = None
password = None password = None
# 创建配置对象(默认从 ini 文件中读取配置) page = ChromiumPage()
co = ChromiumOptions()
# 设置不加载图片、静音
co.no_imgs(True).mute(True)
page = ChromiumPage(addr_or_opts=co)
page.set.load_mode.eager() page.set.load_mode.eager()
page.set.when_download_file_exists('overwrite') page.set.when_download_file_exists('overwrite')
...@@ -84,7 +80,8 @@ def main(): ...@@ -84,7 +80,8 @@ def main():
# 追加数据 # 追加数据
new_list_data.append(data_dict) new_list_data.append(data_dict)
helper.save_xls(new_list_data, '退货明细.xlsx') excel.save_xls(new_list_data, '退货明细.xlsx')
page.close()
def asin_sku_relations(): def asin_sku_relations():
...@@ -103,8 +100,8 @@ def export_list(): ...@@ -103,8 +100,8 @@ def export_list():
open_url("https://vendorcentral.amazon.com/hz/vendor/members/returns?ref_=vc_xx_subNav") open_url("https://vendorcentral.amazon.com/hz/vendor/members/returns?ref_=vc_xx_subNav")
# 导出退货单 # 导出退货单
mission = page.ele("#file-download-button").click.to_download() page.ele("#file-download-button").click.to_download()
mission.wait() page.download.wait()
return pd.read_excel('Return_Summary.xls', engine='xlrd') return pd.read_excel('Return_Summary.xls', engine='xlrd')
...@@ -116,8 +113,8 @@ def export_item(return_id): ...@@ -116,8 +113,8 @@ def export_item(return_id):
if not os.path.isfile(file_name): if not os.path.isfile(file_name):
# 打开退回详情下载明细 # 打开退回详情下载明细
open_url(f"https://vendorcentral.amazon.com/katalmonsapp/vendor/members/returns/{return_id}") open_url(f"https://vendorcentral.amazon.com/katalmonsapp/vendor/members/returns/{return_id}")
mission = page.ele("#file-download-button").click.to_download(rename=file_name) page.ele("#file-download-button").click.to_download(rename=file_name)
mission.wait() page.download.wait()
# 读取回退商品详情 # 读取回退商品详情
return pd.read_excel(file_name, engine='xlrd') return pd.read_excel(file_name, engine='xlrd')
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment