python写一些脚本

  1 import json
  2 import pandas as pd
  3 import re
  4 import requests
  5 from PIL import Image
  6 from io import BytesIO
  7 from openpyxl import load_workbook
  8 from openpyxl.drawing.image import Image as OpenpyxlImage
  9 import os
 10 
 11 
 12 def read_json_and_save_to_excel():
 13 # 假设你的文件路径是当前目录下的data.json
 14 file_path = 'data.json'
 15 # 定义输出的Excel文件名
 16 output_excel_file = 'image_urls.xlsx'
 17 
 18 # 打开文件并读取其内容
 19 with open(file_path, 'r', encoding='utf-8') as file:
 20 # 解析JSON数据
 21 data = json.load(file)
 22 
 23 # 获取实际的数据列表
 24 real_data = data['data']
 25 
 26 # 创建一个空列表来存储imageUrl
 27 image_urls = []
 28 
 29 # 遍历列表并收集所有的imageUrl
 30 for item in real_data:
 31 # 检查是否存在imageUrl键
 32 if 'imageUrl' in item:
 33 image_urls.append(item['imageUrl'])
 34 elif 'liveData' in item and 'imageUrl' in item['liveData']:
 35 image_urls.append(item['liveData']['imageUrl'])
 36 else:
 37 image_urls.append(None) # 如果没有imageUrl，则添加None
 38 
 39 # 将imageUrl列表转换为pandas DataFrame
 40 df = pd.DataFrame(image_urls, columns=['originImageUrl'])
 41 
 42 # 将DataFrame保存到Excel文件
 43 df.to_excel(output_excel_file, index=False)
 44 
 45 print(f"Image URLs have been saved to {output_excel_file}")
 46 
 47 
 48 def read_webp_and_save_to_excel():
 49 # 假设你的文件路径是当前目录下的data.txt
 50 file_path = 'webp.txt'
 51 
 52 # 准备正则表达式，用于匹配URL
 53 url_pattern = re.compile(r'https?://[^\s]+')
 54 
 55 # 读取文本文件并提取URL到列表中
 56 url_list = []
 57 # 打开文件并逐行读取内容
 58 with open(file_path, 'r', encoding='utf-8') as file:
 59 # 逐行读取文件
 60 for line in file:
 61 # 使用正则表达式查找URL
 62 match = url_pattern.search(line)
 63 if match:
 64 # 提取URL
 65 url_list.append(match.group())
 66 
 67 # 读取Excel文件
 68 excel_path = 'image_urls.xlsx'
 69 df = pd.read_excel(excel_path)
 70 
 71 # 创建一个新列webpImageUrl，初始值为None
 72 df['webpImageUrl'] = None
 73 
 74 # 遍历originImageUrl列，匹配并填充webpImageUrl列
 75 for index, row in df.iterrows():
 76 origin_url = row['originImageUrl']
 77 for url in url_list:
 78 if url.startswith(origin_url):
 79 df.at[index, 'webpImageUrl'] = url
 80 break # 找到匹配项后跳出循环
 81 
 82 # 将更新后的DataFrame保存到新的Excel文件中
 83 df.to_excel('webp_image_urls.xlsx', index=False) # 替换为你想要保存的文件名
 84 
 85 
 86 def read_heif_and_save_to_excel():
 87 # 假设你的文件路径是当前目录下的data.txt
 88 file_path = 'heif.txt'
 89 
 90 # 准备正则表达式，用于匹配URL
 91 url_pattern = re.compile(r'https?://[^\s]+')
 92 
 93 # 读取文本文件并提取URL到列表中
 94 url_list = []
 95 # 打开文件并逐行读取内容
 96 with open(file_path, 'r', encoding='utf-8') as file:
 97 # 逐行读取文件
 98 for line in file:
 99 # 使用正则表达式查找URL
100 match = url_pattern.search(line)
101 if match:
102 # 提取URL
103 url_list.append(match.group())
104 
105 # 读取Excel文件
106 excel_path = 'webp_image_urls.xlsx'
107 df = pd.read_excel(excel_path)
108 
109 # 创建一个新列webpImageUrl，初始值为None
110 df['heifImageUrl'] = None
111 
112 # 遍历originImageUrl列，匹配并填充heifImageUrl列
113 for index, row in df.iterrows():
114 origin_url = row['originImageUrl']
115 for url in url_list:
116 if url.startswith(origin_url):
117 df.at[index, 'heifImageUrl'] = url
118 break # 找到匹配项后跳出循环
119 
120 # 将更新后的DataFrame保存到新的Excel文件中
121 df.to_excel('heif_image_urls.xlsx', index=False) # 替换为你想要保存的文件名
122 
123 
124 # 定义一个函数来获取URL对应的文件大小
125 def get_file_size(url):
126 try:
127 response = requests.head(url, allow_redirects=True, timeout=10)
128 size = response.headers.get('content-length', 0) # 获取文件大小
129 return round(int(size) / 1024.0, 2)
130 except Exception as e:
131 print(f"Error getting file size for {url}: {e}")
132 return 0
133 
134 
135 def save_image_file_size_to_excel():
136 # 读取Excel文件
137 excel_path = 'android_final_image_info_with_thumbnails.xlsx' # 替换为你的Excel文件路径
138 df = pd.read_excel(excel_path)
139 
140 if df['originImageSize'].dtype != 'float':
141 df['originImageSize'] = df['originImageSize'].astype(float)
142 if df['webpImageSize'].dtype != 'float':
143 df['webpImageSize'] = df['webpImageSize'].astype(float)
144 if df['heifImageSize'].dtype != 'float':
145 df['heifImageSize'] = df['heifImageSize'].astype(float)
146 # # 创建三个个新列来存储文件大小
147 # df['originImageSize'] = None
148 # df['webpImageSize'] = None
149 # df['heifImageSize'] = None
150 
151 # 遍历DataFrame，获取每个URL对应的文件大小
152 for index, row in df.iterrows():
153 if pd.notnull(row['originImageUrl']):
154 df.at[index, 'originImageSize'] = get_file_size(row['originImageUrl'])
155 if pd.notnull(row['webpImageUrl']):
156 df.at[index, 'webpImageSize'] = get_file_size(row['webpImageUrl'])
157 if pd.notnull(row['heifImageUrl']):
158 df.at[index, 'heifImageSize'] = get_file_size(row['heifImageUrl'])
159 
160 # 将更新后的DataFrame保存到新的Excel文件中
161 df.to_excel('android_final_image_info_with_thumbnails-1.xlsx', index=False) # 替换为你想要保存的文件名
162 
163 
164 # 定义下载图片并创建缩略图的函数
165 def download_and_resize_image(url, size=(128, 128)):
166 try:
167 response = requests.get(url)
168 response.raise_for_status() # 确保请求成功
169 img = Image.open(BytesIO(response.content))
170 img.thumbnail(size) # 创建缩略图
171 
172 # 如果图像是CMYK模式，则转换为RGB
173 if img.mode == 'CMYK':
174 img = img.convert('RGB')
175 
176 return img
177 except Exception as e:
178 print(f"Error downloading or resizing image {url}: {e}")
179 return None
180 
181 
182 def save_thumbnails_to_excel():
183 # 读取Excel文件
184 excel_path = 'android_final_image_info_with_thumbnails-1.xlsx' # 替换为你的Excel文件路径
185 wb = load_workbook(excel_path)
186 ws = wb.active
187 
188 # 定义图片URL列和对应的Excel列
189 image_columns = {
190 'A': 'originImageUrl'
191 }
192 
193 # 缩略图将存储在原始图片URL列的下一列
194 thumbnail_columns = {
195 'G': 'originImageThumbnail'
196 }
197 
198 # 遍历每个图片URL列
199 for original_col, thumbnail_col in zip(image_columns.keys(), thumbnail_columns.keys()):
200 # 从第二行开始处理（假设第一行是标题行）
201 for index, cell in enumerate(ws[original_col][1:], start=2): # 从第二行开始
202 print(f'开始处理第{index}条')
203 image_url = cell.value
204 if image_url:
205 img = download_and_resize_image(image_url)
206 if img:
207 # 将PIL图像保存到字节流中
208 img_byte_array = BytesIO()
209 img.save(img_byte_array, format='PNG')
210 img_byte_array.seek(0)
211 
212 # 创建openpyxl的Image对象
213 openpyxl_img = OpenpyxlImage(img_byte_array)
214 
215 # 将图片插入到指定单元格
216 cell_ref = f'{thumbnail_col}{index}'
217 ws.add_image(openpyxl_img, cell_ref)
218 
219 # 保存更新后的Excel文件
220 wb.save('android_final_image_info_with_thumbnails-2.xlsx')
221 
222 
223 def process_images():
224 # 读取json文件
225 with open('200image.json', 'r') as f:
226 data = json.load(f)
227  
228 image_data = data['data']
229 
230 # 读取excel文件
231 wb = load_workbook('rec_image.xlsx')
232 sheet = wb.active
233 
234 # 创建一个字典，key为A列的url，value为B列的url
235 url_dict = {sheet.cell(row=i, column=1).value.split('@')[0]: sheet.cell(row=i, column=6).value for i in range(1, sheet.max_row+1)}
236 
237 # 删除不存在imageUrl的项
238 # data['data'] = [item for item in data['data'] if 'imageUrl' in item]
239 
240 # 遍历json数组
241 i = 0
242 while i < len(image_data):
243 if 'feedbacks' in image_data[i]:
244 del image_data[i]['feedbacks']
245 if 'mge' in image_data[i]:
246 del image_data[i]['mge']
247 if 'mgeExtras' in image_data[i]:
248 del image_data[i]['mgeExtras']
249 if i % 2 == 0:
250 if 'imageUrl' in image_data[i]:
251 if image_data[i]['imageUrl'] is not None:
252 tempUrl1 = image_data[i]['imageUrl'].split('@')[0]
253 if tempUrl1 in url_dict:
254 tempUrl2 = url_dict[tempUrl1]
255 if i+1 < len(image_data):
256 image_data[i+1]['imageUrl'] = tempUrl2
257 i += 1
258 else:
259 print(i , "imageUrl is 不存在")
260 if i+1 < len(image_data):
261 del image_data[i+1]
262 del image_data[i]
263  
264 else:
265 print(i , "imageUrl is None")
266 i += 1
267 else:
268 if i+1 < len(image_data):
269 del image_data[i+1]
270 del image_data[i]
271 else:
272 i += 1
273  
274 # 保留100条数据
275 data['data'] = data['data'][200:]
276 
277 # 生成新的json文件
278 with open('resF2.json', 'w') as f:
279 json.dump(data, f, ensure_ascii=False, indent=4)
280 
281 
282 def read_excel_and_save_to_json():
283 # 读取Excel文件，不使用第一行作为列名
284 df = pd.read_excel('rec_image.xlsx', header=None)
285 print(df.columns)
286 
287 # 提取第1、2、4、6列的数据
288 df = df[[0, 1, 3, 5]]
289 
290 # 将DataFrame转换为字典列表
291 data_list = df.to_dict('records')
292 
293 # 将字典列表保存到JSON文件
294 with open('output.json', 'w', encoding='utf-8') as f:
295 json.dump(data_list, f, ensure_ascii=False, indent=4)
296 
297 
298 if __name__ == '__main__':
299 read_excel_and_save_to_json()
posted @ 2024-04-17 11:25 谢勇飞~ 阅读(16) 评论(0) 编辑收藏举报
刷新页面返回顶部
谢勇飞~

知识在于点滴积累

python写一些脚本

公告