PHP 提取PDF文件内容
2023-05-29 16:11 天心PHP 阅读(487) 评论(0) 编辑 收藏 举报
这里以提取“亚马逊日期范围报告 PDF 汇总”中的数据为例。
根据路径下载PDF
/**
 * Download a file and store it inside a local directory.
 *
 * The content is fetched with cURL when $type is truthy, otherwise through
 * PHP's stream wrappers (so $url may also be a local path).
 *
 * @param string $url      Location of the file to fetch.
 * @param string $save_dir Target directory; defaults to "./" and is created when missing.
 * @param string $filename Target file name; when empty, the base name of the URL path is used.
 * @param int    $type     Truthy: fetch with cURL; falsy: fetch with file_get_contents().
 *
 * @return array|false false when the arguments are unusable, the directory cannot be
 *                     created, the download fails or the file cannot be written;
 *                     otherwise an array with the keys status (always 1), file_name,
 *                     save_path and file_size (bytes).
 */
public function getFile($url, $save_dir = '', $filename = '', $type = 0)
{
    if (trim($url) === '') {
        return false;
    }

    if (trim($save_dir) === '') {
        $save_dir = './';
    }
    // Normalise to exactly one trailing slash. The previous strrpos() test compared the
    // position of the last slash with 0, which doubled the slash for "dir/" and omitted
    // it for "/dir".
    $save_dir = rtrim($save_dir, '/') . '/';

    if (trim($filename) === '') {
        // Without a file name the target would be the directory itself.
        $filename = basename((string) parse_url($url, PHP_URL_PATH));
        if ($filename === '') {
            return false;
        }
    }

    // Create the target directory; the second is_dir() covers a concurrent creation.
    if (!is_dir($save_dir) && !mkdir($save_dir, 0777, true) && !is_dir($save_dir)) {
        return false;
    }

    if ($type) {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 5);
        $content = curl_exec($ch);
        curl_close($ch);
    } else {
        $content = file_get_contents($url);
    }

    // A failed transfer must not be reported as a successful zero-byte download.
    if ($content === false) {
        return false;
    }

    // Overwrite instead of append: appending to a file left over from an earlier
    // download would produce a corrupt result.
    $save_path = $save_dir . $filename;
    if (file_put_contents($save_path, $content) === false) {
        return false;
    }

    return [
        'status'    => 1,
        'file_name' => $filename,
        'save_path' => $save_path,
        'file_size' => strlen($content),
    ];
}
服务器需要开启 shell_exec,并安装 pdftotext 命令行工具(通常由 poppler-utils 提供)
// Run pdftotext in -layout mode to turn the sample PDF into a text file.
$command = 'pdftotext -layout GAN-IT_242_510181.pdf GAN-IT_242_510181.txt';
shell_exec($command);
得到按行解析的txt
再提取 txt 文件的第 11 行到第 16 行数据(对应代码中从 0 开始的行下标 10–15)
/**
 * Download one report PDF, convert it to text with `pdftotext -layout`,
 * extract the summary rows and store them in {{amazon_report_zn_pdf}}.
 *
 * Keys read from $val: id, accountid, planid, url, batchnumber.
 *
 * Rows are taken from the zero-based text lines 10..15 (non-empty ones) and,
 * for accounts whose site is "uk", additionally from line 50. Columns are the
 * parts of a line separated by two or more whitespace characters; the third
 * column is parsed as the amount and added to the stored total.
 *
 * @param array $val Task description (see the keys above).
 *
 * @return bool true when the task was processed (also when the account is not
 *              in the active list and the task is flagged with is_down = 6);
 *              false when the download or the text conversion failed.
 */
public function downpdfredislisting($val)
{
    // NOTE(review): this forces error display on every call and looks like a
    // debugging leftover — confirm before removing.
    ini_set("display_errors", "On");
    error_reporting(E_ALL | E_STRICT);

    $modelre = new AmazonZnReport();
    $znmodel = new AmazonReportZnInfo();
    $modelpdf = YbModel::model('AmazonReportZnPdf');
    $saveDir = Yii::getPathOfAlias('webroot') . '/upload/pdflabel/';
    $accountId = $val['accountid'];

    $accountnamelist = YbModel::model('AmazonAccount')->queryPairs('id,account_name', 'status=1');
    if (!isset($accountnamelist[$accountId])) {
        // Account is not in the status=1 list: flag the task and stop.
        // The id is bound as a parameter, like the deleteAll() call below.
        $znmodel->updateAll(
            ['is_down' => 6, 'update_at' => date("Y-m-d H:i:s")],
            'id=:id',
            [':id' => $val['id']]
        );
        return true;
    }

    $infoaccount = YbModel::model('AmazonAccount')->findByPk($accountId);
    // Guard against a missing record before reading ->site.
    $isUkSite = is_object($infoaccount) && $infoaccount->site == 'uk';

    $filename = $accountnamelist[$accountId] . '_' . $accountId . '_' . $val['planid'];
    $pdfPath = $saveDir . $filename . '.pdf';
    $txtPath = $saveDir . $filename . '.txt';

    // Remove leftovers of an earlier run so stale text can never be parsed.
    if (file_exists($pdfPath)) {
        @unlink($pdfPath);
    }
    if (file_exists($txtPath)) {
        @unlink($txtPath);
    }

    if ($modelre->getFile($val['url'], $saveDir, $filename . '.pdf') === false) {
        return false;
    }

    $modelpdf->deleteAll(
        'account_id=:account_id and batchnumber=:bn',
        [':account_id' => $accountId, ':bn' => $val['batchnumber']]
    );

    // The file name contains the account name from the database, so both paths
    // are escaped before they reach the shell.
    shell_exec('pdftotext -layout ' . escapeshellarg($pdfPath) . ' ' . escapeshellarg($txtPath));
    if (!file_exists($txtPath)) {
        return false;
    }

    $sum = 0;
    $dlist = [];
    foreach ($modelre->readTXT($txtPath) as $lineNo => $line) {
        $line = trim($line);
        $isSummaryRow = $lineNo >= 10 && $lineNo <= 15 && $line !== '';
        $isUkExtraRow = $isUkSite && $lineNo == 50;
        if (!$isSummaryRow && !$isUkExtraRow) {
            continue;
        }

        // Split directly on runs of whitespace. Replacing them with "_" and
        // exploding on "_" also split labels that contain an underscore.
        $columns = preg_split('/\s{2,}/u', $line);
        if ($columns === false) {
            continue;
        }

        if ($lineNo == 50) {
            // Line 50 has a different layout: its 3rd and 4th columns take the
            // place of label and amount. Skip it when those columns are missing.
            if (!isset($columns[3])) {
                continue;
            }
            $columns[0] = $columns[2];
            $columns[1] = '';
            $columns[2] = $columns[3];
            unset($columns[3]);
        }

        if (count($columns) == 1) {
            continue;
        }

        // A row without a third column is still recorded but adds nothing to the total.
        if (isset($columns[2])) {
            $sum += $this->parseReportAmount($columns[2]);
        }
        $dlist[] = $columns;
    }

    $row = [
        'account_id'   => $accountId,
        'account_name' => $accountnamelist[$accountId],
        'description'  => json_encode($dlist),
        'total_price'  => $sum,
        'url'          => '/upload/pdflabel/' . $filename . '.pdf',
        'batchnumber'  => $val['batchnumber'],
        'create_time'  => date('Y-m-d H:i:s'),
    ];
    $modelpdf->batchReplaceAll("{{amazon_report_zn_pdf}}", array_keys($row), [$row]);

    $znmodel->updateAll(
        ['is_down' => 1, 'update_at' => date("Y-m-d H:i:s")],
        'id=:id',
        [':id' => $val['id']]
    );

    // Best-effort cleanup of the intermediate text file.
    @unlink($txtPath);

    return true;
}

/**
 * Convert an amount string that may use "." or "," as decimal or thousands
 * separator into a float.
 *
 * - Both separators present: the one that appears first is treated as the
 *   thousands separator, the other one as the decimal separator.
 * - At most one kind present: it is the decimal separator when the text after
 *   its first occurrence has exactly two characters, otherwise every separator
 *   is dropped as a thousands separator.
 *
 * @param string $raw Amount text taken from the report.
 *
 * @return float
 */
private function parseReportAmount($raw)
{
    $dotPos = strpos($raw, '.');
    $commaPos = strpos($raw, ',');

    if ($dotPos !== false && $commaPos !== false) {
        if ($dotPos < $commaPos) {
            // "1.234,56" style: drop the dots, the comma becomes the decimal point.
            return (double) str_replace(',', '.', str_replace('.', '', $raw));
        }
        // "1,234.56" style: drop the commas.
        return (double) str_replace(',', '', $raw);
    }

    $parts = [];
    if ($dotPos !== false) {
        $parts = explode('.', $raw);
    }
    if ($commaPos !== false) {
        $parts = explode(',', $raw);
    }

    if (isset($parts[1]) && strlen($parts[1]) == 2) {
        return (double) str_replace(',', '.', $raw);
    }

    return (double) str_replace(['.', ','], '', $raw);
}
/**
 * Lazily read a text file line by line.
 *
 * Each yielded value is one line without its trailing line break ("\n" or
 * "\r\n"); keys are the zero-based line numbers. Nothing is yielded after the
 * last line, so a file ending in a line break does not produce an extra empty
 * entry.
 *
 * @param string $filePath Path of the file to read.
 *
 * @return Generator
 *
 * @throws RuntimeException when the file cannot be opened (raised on first iteration).
 */
public function readTXT($filePath)
{
    $handle = fopen($filePath, 'r');
    if ($handle === false) {
        // Looping on feof() with an invalid handle never terminates.
        throw new RuntimeException('Unable to open file: ' . $filePath);
    }

    try {
        // fgets() returns false at end of file; testing its result avoids the
        // spurious empty line a feof()-driven loop yields.
        while (($line = fgets($handle)) !== false) {
            yield rtrim($line, "\r\n");
        }
    } finally {
        // Also runs when the consumer stops iterating early.
        fclose($handle);
    }
}
得到数据