代码改变世界

PHP 提取PDF文件内容

2023-05-29 16:11  天心PHP  阅读(494)  评论(0)  编辑  收藏  举报

这里以提取亚马逊日期范围报告 PDF 中的汇总数据为例。

根据路径下载PDF

 /**
     * Download a file and store it in a local directory.
     *
     * @param string $url      Source location; fetched with cURL or readfile() depending on $type.
     * @param string $save_dir Target directory; defaults to './' and is created recursively when missing.
     * @param string $filename Name of the file to create inside $save_dir.
     * @param int    $type     Truthy: fetch with cURL; falsy: fetch with readfile().
     * @return array|false     false when the URL is empty, the directory cannot be created,
     *                         the fetch fails or the file cannot be written; otherwise an array
     *                         with the keys status, file_name, save_path and file_size.
     */
    public function getFile($url, $save_dir = '', $filename = '', $type = 0)
    {
        if (trim($url) == '') {
            return false;
        }
        if (trim($save_dir) == '') {
            $save_dir = './';
        }
        // Append the separator only when it is missing. The previous check compared the
        // position of the last slash against 0, which doubled the slash on paths like "./".
        if (substr($save_dir, -1) !== '/') {
            $save_dir .= '/';
        }
        // Create the target directory; the second is_dir() covers a concurrent creation.
        if (!is_dir($save_dir) && !mkdir($save_dir, 0777, true) && !is_dir($save_dir)) {
            return false;
        }
        // Fetch the content with the requested transport.
        if ($type) {
            $ch = curl_init();
            curl_setopt($ch, CURLOPT_URL, $url);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
            curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 5);
            $content = curl_exec($ch);
            curl_close($ch);
        } else {
            ob_start();
            $read = readfile($url);
            $content = ob_get_clean();
            if ($read === false) {
                $content = false;
            }
        }
        // Do not write a file (or report success) when nothing could be fetched.
        if ($content === false) {
            return false;
        }
        $save_path = $save_dir . $filename;
        // Overwrite instead of append: appending to an earlier download would corrupt the file.
        // Warnings stay suppressed as before; the return value is checked instead.
        if (@file_put_contents($save_path, $content) === false) {
            return false;
        }
        return array(
            'status' => 1,
            'file_name' => $filename,
            'save_path' => $save_path,
            'file_size' => strlen($content)
        );
    }

服务器需要开启 shell_exec，并且已安装 pdftotext 命令行工具（例如 poppler-utils 软件包）。

// Convert the PDF to plain text with the pdftotext command-line tool;
// the -layout flag asks pdftotext to preserve the page's physical layout.
shell_exec("pdftotext -layout GAN-IT_242_510181.pdf  GAN-IT_242_510181.txt");

得到按行解析的txt

接着提取 txt 文件的第 11 行到第 16 行数据（对应代码中从 0 开始计数的行号 10～15）：

/**
 * Download one report PDF, convert it to text with pdftotext, extract the
 * summary rows and store them in {{amazon_report_zn_pdf}}.
 *
 * Rows are taken from the 0-based text lines 10-15; for accounts whose site
 * is 'uk', line 50 is read as well. The third column of every row is parsed
 * as an amount and added to the stored total.
 *
 * @param array $val Task data; the keys read here are 'id', 'accountid',
 *                   'planid', 'url' and 'batchnumber'.
 * @return bool true when the task was processed (also when the account is not
 *              in the active list and the task is flagged with is_down = 6),
 *              false when the download or the text conversion failed.
 */
public function downpdfredislisting($val){
    ini_set("display_errors", "On");
    // E_ALL already contains E_STRICT since PHP 5.4; the E_STRICT constant is deprecated in PHP 8.4.
    error_reporting(E_ALL);
    $modelre = new AmazonZnReport();
    $znmodel = new AmazonReportZnInfo();
    $modelpdf = YbModel::model('AmazonReportZnPdf');
    $url = Yii::getPathOfAlias('webroot') . '/upload/pdflabel/';
    $accountnamelist = YbModel::model('AmazonAccount')->queryPairs('id,account_name','status=1');
    if (!isset($accountnamelist[$val['accountid']])) {
        // Bound parameter instead of string interpolation, like the deleteAll() call below.
        // NOTE(review): assumes updateAll() takes (attributes, condition, params) as in Yii 1.1 CActiveRecord.
        $znmodel->updateAll(['is_down' => 6, 'update_at' => date("Y-m-d H:i:s")], 'id=:id', [':id' => $val['id']]);
        return true;
    }
    $infoaccount = YbModel::model('AmazonAccount')->findByPk($val['accountid']);
    // Guard against a missing record instead of dereferencing null inside the loop.
    $site = $infoaccount ? $infoaccount->site : null;

    $filename = $accountnamelist[$val['accountid']].'_'.$val['accountid'].'_'.$val['planid'];
    $pdfPath = $url . $filename . '.pdf';
    $txtPath = $url . $filename . '.txt';
    if (file_exists($pdfPath)) {
        @unlink($pdfPath);
    }
    // A text file left behind by an aborted run must not be mistaken for fresh output.
    if (file_exists($txtPath)) {
        @unlink($txtPath);
    }

    // Download the PDF; stop before touching stored rows when nothing was downloaded.
    $download = $modelre->getFile($val['url'], $url, $filename.'.pdf');
    if ($download === false) {
        return false;
    }
    $modelpdf->deleteAll('account_id=:account_id and batchnumber=:bn', [':account_id' => $val['accountid'], ':bn' => $val['batchnumber']]);

    // The file name contains the account name, so quote both paths for the shell.
    // NOTE(review): escapeshellarg() drops bytes that are invalid in the current LC_CTYPE locale;
    // confirm the locale if account names can contain non-ASCII characters.
    shell_exec('pdftotext -layout ' . escapeshellarg($pdfPath) . ' ' . escapeshellarg($txtPath));
    if (!file_exists($txtPath)) {
        return false;
    }

    $sum = 0;
    $dlist = [];
    foreach ($modelre->readTXT($txtPath) as $keyp => $valp) {
        $line = trim($valp);
        $isSummaryRow = $keyp >= 10 && $keyp <= 15 && $line !== '';
        $isUkExtraRow = $site === 'uk' && $keyp == 50;
        if (!$isSummaryRow && !$isUkExtraRow) {
            continue;
        }
        // Columns are separated by runs of two or more whitespace characters. Splitting
        // directly avoids breaking columns that themselves contain an underscore.
        $reslist = preg_split("/\s{2,}/u", $line);
        if ($reslist === false) {
            continue;
        }
        if ($keyp == 50) {
            // Line 50 carries its label and amount in the 3rd and 4th columns;
            // skip it when those columns are missing instead of storing an empty row.
            if (!isset($reslist[2], $reslist[3])) {
                continue;
            }
            $reslist[0] = $reslist[2];
            $reslist[1] = '';
            $reslist[2] = $reslist[3];
            unset($reslist[3]);
        }
        if (count($reslist) == 1) {
            continue;
        }
        // A row without a third column contributes 0 to the total but is still stored.
        $amountText = isset($reslist[2]) ? $reslist[2] : '';
        $sum += $this->parseReportAmount($amountText);
        $dlist[] = $reslist;
    }

    $data = [
        'account_id' => $val['accountid'],
        'account_name' => $accountnamelist[$val['accountid']],
        'description' => json_encode($dlist),
        'total_price' => $sum,
        'url' => '/upload/pdflabel/'.$filename.'.pdf',
        'batchnumber' => $val['batchnumber'],
        'create_time' => date('Y-m-d H:i:s'),
    ];
    $modelpdf->batchReplaceAll("{{amazon_report_zn_pdf}}", array_keys($data), [$data]);
    $znmodel->updateAll(['is_down' => 1, 'update_at' => date("Y-m-d H:i:s")], 'id=:id', [':id' => $val['id']]);
    @unlink($txtPath);
    return true;
}

/**
 * Convert an amount written with '.' and/or ',' separators to a float.
 *
 * - Both separators present: the one that appears last is the decimal mark
 *   ("1.234,56" and "1,234.56" both give 1234.56).
 * - One separator followed by exactly two characters: it is the decimal mark
 *   ("12,50" gives 12.5).
 * - Otherwise every separator is treated as a thousands separator
 *   ("1.234" gives 1234).
 *
 * @param string $text Amount text as it appears in the report.
 * @return float
 */
private function parseReportAmount($text)
{
    $dotPos = strpos($text, '.');
    $commaPos = strpos($text, ',');

    if ($dotPos !== false && $commaPos !== false) {
        if ($dotPos < $commaPos) {
            // "1.234,56": dots group thousands, the comma is the decimal mark.
            return (float)str_replace(',', '.', str_replace('.', '', $text));
        }
        // "1,234.56": commas group thousands.
        return (float)str_replace(',', '', $text);
    }

    $parts = [];
    if ($dotPos !== false) {
        $parts = explode('.', $text);
    } elseif ($commaPos !== false) {
        $parts = explode(',', $text);
    }
    if (isset($parts[1]) && strlen($parts[1]) == 2) {
        return (float)str_replace(',', '.', $text);
    }
    return (float)str_replace(['.', ','], '', $text);
}

 

/**
 * Lazily read a text file line by line.
 *
 * Each line is yielded without its trailing line break ("\n" or "\r\n"),
 * keyed by its 0-based line index.
 *
 * @param string $filePath Path of the file to read.
 * @return Generator
 * @throws RuntimeException when the file cannot be opened.
 */
public function readTXT($filePath)
{
    $handle = @fopen($filePath, 'r');
    if ($handle === false) {
        // Without this check feof(false) never becomes true (endless loop on PHP 7, TypeError on PHP 8).
        throw new \RuntimeException("Unable to open file for reading: {$filePath}");
    }
    try {
        // Loop on fgets() itself: a feof()-driven loop yields one extra empty line at the end.
        while (($line = fgets($handle)) !== false) {
            yield rtrim($line, "\r\n");
        }
    } finally {
        // Also runs when the consumer stops iterating early, so the handle is never leaked.
        fclose($handle);
    }
}

 

 得到数据