1. 背景
??????? 針對下面的文件data.csv,首先根據(jù)fid進(jìn)行排序,然后分組,使相同fid的記錄放到同一個excel文件中,并對每列重復(fù)的數(shù)據(jù)元素染上紅色。
fid,user_id
-1000078398032092029,230410010036537520
-1000078398032092029,230423010026993942
-1000078398032092029,230505010027684603
-101241766345369238,210911010005526495
-101241766345369238,211017010017923011
-101241766345369238,230113010029633164
-101241766345369238,230514010028256452
-101241766345369238,230518010036813773
-1045165137456710,220401010038956742
-1045165137456710,220401010038956742
-1050918014514687463,210805010001898014
-1050918014514687463,210805010001898014
-111(手動添加一個結(jié)束標(biāo)志)
2. 分組切割文件
import pandas as pd
pd.set_option('display.max_rows', None)
# 根據(jù)fid對文件進(jìn)行分割,每個fid一個文件
def split_df_by_fid():
df = pd.read_csv('data.csv', dtype=str)
row_split_list = [] # 記錄分割點索引
current_fid = '-1000078398032092029' # 第一個fid
start = 0
end = start
cnt = 1 # 記錄當(dāng)前是第幾個分割子文件
for fid in df['fid']:
if fid != current_fid:
row_split_list.append((start, end))
# 當(dāng)前fid組寫入一個新文件
df[start:end].to_csv('data_split/' + str(cnt) + '.csv', index=0)
cnt += 1
current_fid = fid
start = end
end += 1
print("總文件數(shù): ", len(row_split_list))
print(row_split_list)
# [(0, 3), (3, 8), (8, 10), (10, 12)]
輸出
3. 染色-寫入excel
1. 找到同列重復(fù)元素
def group_by_find_duplicate_values(group_df: pd.DataFrame, col: str) -> list:
value_counts = group_df[col].value_counts().reset_index()
return value_counts[value_counts[col] > 1]['index'].to_list()
2. 插入圖片
def inset_a_img(row_index, col_index, img_name):
image_path = os.path.join("data_img/", img_name.replace('/', '_'))
h, w, *_ = cv2.imread(image_path).shape
scale = CEIL_HEIGHT * 1.3 / h
SHEET.insert_image(row_index, col_index, image_path, # x_offset可調(diào)整x軸圖片偏移
{'x_offset': 0, 'y_offset': 0, 'x_scale': scale, 'y_scale': scale, 'positioning': 1})
3. 同列相同元素染色
# -*- coding: utf-8 -*-
import os
import cv2
import tqdm
import pandas as pd
import xlsxwriter
CEIL_HEIGHT = 156
def write_color():
df = pd.read_csv(input_file, dtype=str)
for i, col in enumerate(df.columns):
SHEET.write(0, i, col) # 第0行第i列插入表頭字段
try:
for i, line in tqdm.tqdm(enumerate(df.itertuples())): # tqdm: 顯示進(jìn)度條
temp_df = df[df['fid'] == line.fid]
for j, col in enumerate(df.columns):
duplicate_values = group_by_find_duplicate_values(temp_df, col)
content = str(df.iloc[i, j])
# 染色
cell_format = BOOK.add_format({'font_color': 'red' if (content in duplicate_values and j > 0) else 'black'})
if col not in ('face_path', 'ocr_path'):
# SHEET.write(*(i + 1, j), content, cell_format)
SHEET.write(i+1, j, content, cell_format)
else:
if col == 'face_path' and not pd.isna(line.face_path):
inset_a_img(i+1, j, line.face_path) # 為Nan的置空,不寫入圖片
if col == 'ocr_path' and not pd.isna(line.ocr_path):
inset_a_img(i+1, j+1-1, line.ocr_path)
BOOK.close()
except Exception as e:
print(e)
if __name__ == '__main__':
for file in os.listdir('data_split'):
input_file = 'data_split/' + file
output_file = 'data_split_xlsx/' + file.replace('csv', 'xlsx')
BOOK = xlsxwriter.Workbook(output_file)
SHEET = BOOK.add_worksheet('sheet1')
SHEET.set_default_row(CEIL_HEIGHT)
SHEET.set_column(0, 60, 25) # 分別為要修改的起始列,終止列,設(shè)置的列寬
write_color()
輸出
文章來源:http://www.zghlxwxcb.cn/news/detail-613896.html
文章來源地址http://www.zghlxwxcb.cn/news/detail-613896.html
到了這里,關(guān)于文件按關(guān)鍵字分組-切割-染色-寫入excel的文章就介紹完了。如果您還想了解更多內(nèi)容,請在右上角搜索TOY模板網(wǎng)以前的文章或繼續(xù)瀏覽下面的相關(guān)文章,希望大家以后多多支持TOY模板網(wǎng)!