Writing some Python scripts
import json
import pandas as pd
import re
import requests
from PIL import Image
from io import BytesIO
from openpyxl import load_workbook
from openpyxl.drawing.image import Image as OpenpyxlImage
import os


def read_json_and_save_to_excel():
    # Input JSON file in the current directory
    file_path = 'data.json'
    # Name of the output Excel file
    output_excel_file = 'image_urls.xlsx'

    # Open the file and parse the JSON data
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # The actual list of records lives under the 'data' key
    real_data = data['data']

    # Collect every imageUrl into a list
    image_urls = []
    for item in real_data:
        # The URL may sit at the top level or be nested under liveData
        if 'imageUrl' in item:
            image_urls.append(item['imageUrl'])
        elif 'liveData' in item and 'imageUrl' in item['liveData']:
            image_urls.append(item['liveData']['imageUrl'])
        else:
            image_urls.append(None)  # No imageUrl: record None

    # Convert the list into a pandas DataFrame and save it to Excel
    df = pd.DataFrame(image_urls, columns=['originImageUrl'])
    df.to_excel(output_excel_file, index=False)

    print(f"Image URLs have been saved to {output_excel_file}")


def read_webp_and_save_to_excel():
    # Text file with one WebP URL per line, in the current directory
    file_path = 'webp.txt'

    # Regular expression for matching URLs
    url_pattern = re.compile(r'https?://[^\s]+')

    # Read the text file line by line and extract the URLs
    url_list = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            match = url_pattern.search(line)
            if match:
                url_list.append(match.group())

    # Read the Excel file produced by read_json_and_save_to_excel
    excel_path = 'image_urls.xlsx'
    df = pd.read_excel(excel_path)

    # New column webpImageUrl, initialised to None
    df['webpImageUrl'] = None

    # Walk the originImageUrl column and fill in the matching WebP URL
    for index, row in df.iterrows():
        origin_url = row['originImageUrl']
        if pd.isnull(origin_url):
            continue  # skip rows without an original URL
        for url in url_list:
            if url.startswith(origin_url):
                df.at[index, 'webpImageUrl'] = url
                break  # stop at the first match

    # Save the updated DataFrame to a new Excel file
    df.to_excel('webp_image_urls.xlsx', index=False)  # change the name if needed


def read_heif_and_save_to_excel():
    # Text file with one HEIF URL per line, in the current directory
    file_path = 'heif.txt'

    # Regular expression for matching URLs
    url_pattern = re.compile(r'https?://[^\s]+')

    # Read the text file line by line and extract the URLs
    url_list = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            match = url_pattern.search(line)
            if match:
                url_list.append(match.group())

    # Read the Excel file produced by read_webp_and_save_to_excel
    excel_path = 'webp_image_urls.xlsx'
    df = pd.read_excel(excel_path)

    # New column heifImageUrl, initialised to None
    df['heifImageUrl'] = None

    # Walk the originImageUrl column and fill in the matching HEIF URL
    for index, row in df.iterrows():
        origin_url = row['originImageUrl']
        if pd.isnull(origin_url):
            continue  # skip rows without an original URL
        for url in url_list:
            if url.startswith(origin_url):
                df.at[index, 'heifImageUrl'] = url
                break  # stop at the first match

    # Save the updated DataFrame to a new Excel file
    df.to_excel('heif_image_urls.xlsx', index=False)  # change the name if needed


# Return the size (in KB) of the file behind a URL, using a HEAD request
def get_file_size(url):
    try:
        response = requests.head(url, allow_redirects=True, timeout=10)
        size = response.headers.get('content-length', 0)  # size in bytes
        return round(int(size) / 1024.0, 2)
    except Exception as e:
        print(f"Error getting file size for {url}: {e}")
        return 0


def save_image_file_size_to_excel():
    # Read the Excel file (change the path to your own workbook)
    excel_path = 'android_final_image_info_with_thumbnails.xlsx'
    df = pd.read_excel(excel_path)

    # Make sure the size columns are floats before writing into them
    if df['originImageSize'].dtype != 'float':
        df['originImageSize'] = df['originImageSize'].astype(float)
    if df['webpImageSize'].dtype != 'float':
        df['webpImageSize'] = df['webpImageSize'].astype(float)
    if df['heifImageSize'].dtype != 'float':
        df['heifImageSize'] = df['heifImageSize'].astype(float)
    # Alternatively, create the three size columns from scratch:
    # df['originImageSize'] = None
    # df['webpImageSize'] = None
    # df['heifImageSize'] = None

    # Fetch the file size for every URL in the DataFrame
    for index, row in df.iterrows():
        if pd.notnull(row['originImageUrl']):
            df.at[index, 'originImageSize'] = get_file_size(row['originImageUrl'])
        if pd.notnull(row['webpImageUrl']):
            df.at[index, 'webpImageSize'] = get_file_size(row['webpImageUrl'])
        if pd.notnull(row['heifImageUrl']):
            df.at[index, 'heifImageSize'] = get_file_size(row['heifImageUrl'])

    # Save the updated DataFrame to a new Excel file
    df.to_excel('android_final_image_info_with_thumbnails-1.xlsx', index=False)


# Download an image and shrink it to a thumbnail
def download_and_resize_image(url, size=(128, 128)):
    try:
        response = requests.get(url)
        response.raise_for_status()  # make sure the request succeeded
        img = Image.open(BytesIO(response.content))
        img.thumbnail(size)  # resize in place, keeping the aspect ratio

        # Convert CMYK images to RGB so they can be saved as PNG
        if img.mode == 'CMYK':
            img = img.convert('RGB')

        return img
    except Exception as e:
        print(f"Error downloading or resizing image {url}: {e}")
        return None


def save_thumbnails_to_excel():
    # Open the workbook produced by save_image_file_size_to_excel
    excel_path = 'android_final_image_info_with_thumbnails-1.xlsx'
    wb = load_workbook(excel_path)
    ws = wb.active

    # Column holding the image URLs
    image_columns = {
        'A': 'originImageUrl'
    }

    # Column that will receive the thumbnails
    thumbnail_columns = {
        'G': 'originImageThumbnail'
    }

    # Process each URL column together with its thumbnail column
    for original_col, thumbnail_col in zip(image_columns.keys(), thumbnail_columns.keys()):
        # Start from row 2, assuming row 1 is the header
        for index, cell in enumerate(ws[original_col][1:], start=2):
            print(f'Processing row {index}')
            image_url = cell.value
            if image_url:
                img = download_and_resize_image(image_url)
                if img:
                    # Write the PIL image into an in-memory PNG
                    img_byte_array = BytesIO()
                    img.save(img_byte_array, format='PNG')
                    img_byte_array.seek(0)

                    # Wrap it in an openpyxl Image and anchor it to the target cell
                    openpyxl_img = OpenpyxlImage(img_byte_array)
                    cell_ref = f'{thumbnail_col}{index}'
                    ws.add_image(openpyxl_img, cell_ref)

    # Save the updated workbook
    wb.save('android_final_image_info_with_thumbnails-2.xlsx')


def process_images():
    # Read the JSON file
    with open('200image.json', 'r') as f:
        data = json.load(f)

    image_data = data['data']

    # Read the Excel file
    wb = load_workbook('rec_image.xlsx')
    sheet = wb.active

    # Map the base URL in column A (before the '@' suffix) to the URL in column F
    url_dict = {
        sheet.cell(row=i, column=1).value.split('@')[0]: sheet.cell(row=i, column=6).value
        for i in range(1, sheet.max_row + 1)
    }

    # Optionally drop items without an imageUrl up front:
    # data['data'] = [item for item in data['data'] if 'imageUrl' in item]

    # Walk the JSON array in pairs: the even-indexed item drives the replacement
    i = 0
    while i < len(image_data):
        if 'feedbacks' in image_data[i]:
            del image_data[i]['feedbacks']
        if 'mge' in image_data[i]:
            del image_data[i]['mge']
        if 'mgeExtras' in image_data[i]:
            del image_data[i]['mgeExtras']
        if i % 2 == 0:
            if 'imageUrl' in image_data[i]:
                if image_data[i]['imageUrl'] is not None:
                    tempUrl1 = image_data[i]['imageUrl'].split('@')[0]
                    if tempUrl1 in url_dict:
                        # Rewrite the next item's imageUrl with the mapped URL
                        tempUrl2 = url_dict[tempUrl1]
                        if i + 1 < len(image_data):
                            image_data[i + 1]['imageUrl'] = tempUrl2
                        i += 1
                    else:
                        # No mapping found: drop this item and its partner
                        print(i, "imageUrl not found in the mapping")
                        if i + 1 < len(image_data):
                            del image_data[i + 1]
                        del image_data[i]
                else:
                    print(i, "imageUrl is None")
                    i += 1
            else:
                # No imageUrl key: drop this item and its partner
                if i + 1 < len(image_data):
                    del image_data[i + 1]
                del image_data[i]
        else:
            i += 1

    # Drop the first 200 entries and keep the rest
    data['data'] = data['data'][200:]

    # Write the result to a new JSON file
    with open('resF2.json', 'w') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)


def read_excel_and_save_to_json():
    # Read the Excel file without treating the first row as a header
    df = pd.read_excel('rec_image.xlsx', header=None)
    print(df.columns)

    # Keep columns 1, 2, 4 and 6 (0-based indices 0, 1, 3, 5)
    df = df[[0, 1, 3, 5]]

    # Convert the DataFrame into a list of dicts
    data_list = df.to_dict('records')

    # Save the list to a JSON file
    with open('output.json', 'w', encoding='utf-8') as f:
        json.dump(data_list, f, ensure_ascii=False, indent=4)


if __name__ == '__main__':
    read_excel_and_save_to_json()
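The script's __main__ block only runs the last helper, while the earlier functions form a chain keyed off the Excel files they read and write (data.json -> image_urls.xlsx -> webp_image_urls.xlsx -> heif_image_urls.xlsx). A minimal driver sketch of that chain is shown below; the ordering is an assumption inferred from those file names, and the last two steps assume you have separately prepared android_final_image_info_with_thumbnails.xlsx, since nothing in the script creates it.

# Minimal driver sketch (assumed ordering, not part of the original script).
# Assumes data.json, webp.txt and heif.txt exist in the working directory, and
# that android_final_image_info_with_thumbnails.xlsx has been prepared by hand
# from heif_image_urls.xlsx before the size/thumbnail steps run.
if __name__ == '__main__':
    read_json_and_save_to_excel()    # data.json -> image_urls.xlsx
    read_webp_and_save_to_excel()    # webp.txt + image_urls.xlsx -> webp_image_urls.xlsx
    read_heif_and_save_to_excel()    # heif.txt + webp_image_urls.xlsx -> heif_image_urls.xlsx
    save_image_file_size_to_excel()  # fills the *ImageSize columns via HTTP HEAD requests
    save_thumbnails_to_excel()       # embeds PNG thumbnails into column G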
Knowledge is built up bit by bit.