怎样提取pdf表格的内容（关于PDF文件表格提取实现）

PDF 文件的内容格式比较特殊：根据实际观察，PDF 只是把源文件的内容逐行写入文件，再通过坐标定位的方式还原出与原文一致的版面，因此表格在 PDF 中的表现形式也比较特殊。接下来我们就来聊聊怎样提取 PDF 表格的内容，以下内容供大家参考，希望能帮到您！

怎样提取pdf表格的内容

PDF 文件的内容格式比较特殊：根据实际观察，PDF 只是把源文件的内容逐行写入文件，再通过坐标定位的方式还原出与原文一致的版面，因此表格在 PDF 中的表现形式也比较特殊。

实现思路：

先通过 PDF 内容识别找到表格所在的页（这一步只是为了提高速度、减少其他内容的干扰），再把表格所在的页面截取到一个新的 PDF 文件中，然后将新生成的 PDF 转换为 HTML 文件，最后根据边框的定位信息用算法重新组装表格。此方法可以识别空白列，也可以处理同一单元格内存在多行文本的情况。

用的技术框架：

jsoup，itextpdf，pdfbox（配合 pdf2dom 提供的 PDFDomTree 将 PDF 转换为 HTML）

/** * 读取pdf文件转为list集合 * @param pdfPath * @return */ public static List<List<String>> getDataFromPdf(String pdfPath){ List<List<String>> datas=new ArrayList<>(); String newPdfPath=pdfPath.replace(".pdf","_01.pdf"); String htmlPath=pdfPath.replace(".pdf","_01.html"); //确认附件表格所在的页面，返回页码 int[] pageNums=readPdf(pdfPath); //读取存在表格附件的页面 partitionPdfFile(pdfPath,newPdfPath,pageNums[0],pageNums[1]); byte[] bytes = getBytes(newPdfPath); try (BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(htmlPath)),"UTF-8"));){ //加载PDF文档 PDDocument document = PDDocument.load(bytes); PDFDomTree pdfDomTree = new PDFDomTree(); pdfDomTree.writeText(document,out); datas=ParseHtml(htmlPath); } catch (Exception e) { e.printStackTrace(); }finally { //删除缓存文件 File pdf_01=new File(newPdfPath); if(pdf_01.exists()){ pdf_01.delete(); } File html_01=new File(htmlPath); if(html_01.exists()){ html_01.delete(); } } return datas; } /*** * 读取pdf 确定内容所在页 * @param pdfPath */ private static int[] readPdf(String pdfPath){ int[] pageNums=new int[2]; try { PdfReader reader = new PdfReader(pdfPath); int pageNum = reader.getNumberOfPages(); boolean isGo=false; for(int i=1;i<=pageNum;i ){ String pageContent = PdfTextExtractor.getTextFromPage(reader, i);//读取第i页的文档内容 if((pageContent.trim().length()>0&&pageContent.startsWith("附件"))){ pageNums[0]=i; isGo=true; } if(isGo&&pageContent.trim().length()<50){ pageNums[1]=i-1; //break; } } } catch (Exception e) { e.printStackTrace(); }finally{ } return pageNums; }

/** * pdf 转换为html * @param html * @return * @throws IOException */ private static List<List<String>> ParseHtml(String html) throws IOException { org.jsoup.nodes.Document document = Jsoup.parse(new File(html), "utf-8"); Elements postItems = document.select("div.page"); //循环处理每页 List<List<String>> datas=new ArrayList<>(); for (int i=0;i<postItems.size()-1;i ) { //border-bottom Elements table_row= postItems.get(i).select("[style*=border-bottom:]"); if(table_row.size()==0) continue; //输出表格第一行 String css=table_row.first().attr("style"); String width=(process(css,"width")); //获取除标题部分内容区域 table_row=postItems.get(i).select(String.format("[style*=border-bottom:][style*=width:%s]",width)); Elements table_col= postItems.get(i).select("[style*=border-right:]"); for (int iw=(i==0?1:0);iw<table_row.size()-1;iw ) { datas.add(getRow(postItems.get(i), table_row, table_col, iw)); } } return datas; } /** * 读取pdf指定页内容 * @param pdfFile * @param newFile * @param from * @param end */ private static void partitionPdfFile(String pdfFile,String newFile, int from, int end) { Document document = null; Pdfcopy copy = null; try { PdfReader reader = new PdfReader(pdfFile); int n = reader.getNumberOfPages(); if (end == 0) { end = n; } document = new Document(reader.getPageSize(1)); copy = new PdfCopy(document, new FileOutputStream(newFile)); document.open(); for (int j = from; j <= end; j ) { document.newPage(); PdfImportedPage page = copy.getImportedPage(reader, j); copy.addPage(page); } document.close(); } catch (Exception e) { e.printStackTrace(); } } /* 将文件转换为byte数组 */ private static byte[] getBytes(String filePath){ byte[] buffer = null; try { File file = new File(filePath); FileInputStream fis = new FileInputStream(file); ByteArrayOutputStream bos = new ByteArrayOutputStream(1000); byte[] b = new byte[1000]; int n; while ((n = fis.read(b)) != -1) { bos.write(b, 0, n); } fis.close(); bos.close(); buffer = bos.toByteArray(); } catch (FileNotFoundException e) { e.printStackTrace(); } catch 
(IOException e) { e.printStackTrace(); } return buffer; }

下面是 HTML 的解析方式：通过边框定位，找到每一行、每一列所处的位置，以及该位置所包含的元素。

/** * 从第二行开始（去除标题行） * @param postItem * @param table_col * @param index * @return */ private static List<String> getRow(Element postItem,Elements postItems,Elements table_col,int index) { String top = (process(postItems.get(index).attr("style"), "top")); String bottom = (process(postItems.get(index 1).attr("style"), "top")); Elements tables = postItem.select("[style*=top:]"); List<String> data = new ArrayList<>(); double dbottom = Double.parseDouble(bottom); double dtop = Double.parseDouble(top); boolean isGo = false; for (int iiy = 0; iiy < table_col.size() - 1; iiy ) { StringBuilder sbs = new StringBuilder(); for (Element spostItem : tables) { String top2 = (process(spostItem.attr("style"), "top")); double top2s = Double.parseDouble(top2); if (top2s > dtop && top2s < dbottom) { String left2 = (process(spostItem.attr("style"), "left")); double[] cols = getRowCol(table_col, iiy); double left2s = Double.parseDouble(left2); if (left2s > cols[0] && left2s < cols[1]) { sbs.append(spostItem.text()); } } } if(sbs.length()==0) { data.add("-"); }else{ data.add(sbs.toString()); } } return data; } /** * 定位列的位置 * @param table_col * @param index * @return */ private static double[] getRowCol(Elements table_col,int index){ StringBuilder sbd=new StringBuilder(); String left=(process(table_col.get(index).attr("style"),"left")); String right=(process(table_col.get(index 1).attr("style"),"left")); return new double[]{Double.parseDouble(left),Double.parseDouble(right)}; } /** * 读取html中样式的指定属性 * @param style * @param extract * @return */ private static String process(String style,String extract) { if (style.contains(extract)) { style = style.substring(style.indexOf(extract ":")); style = style.substring(0, style.indexOf(";")); String attr = style.substring(style.indexOf(":") 1); return (attr.substring(0,attr.length()-2)); } return null; }

pom配置

<dependency> <groupId>com.itextpdf</groupId> <artifactId>itextpdf</artifactId> <version>5.5.13</version> </dependency> <dependency> <groupId>org.jsoup</groupId> <artifactId>jsoup</artifactId> <version>1.12.1</version> </dependency> <dependency> <groupId>commons-io</groupId> <artifactId>commons-io</artifactId> <version>2.5</version> </dependency> <dependency> <groupId>org.apache.pdfbox</groupId> <artifactId>fontbox</artifactId> <version>2.0.0</version> </dependency> <dependency> <groupId>com.itextpdf.tool</groupId> <artifactId>xmlworker</artifactId> <version>5.5.11</version> </dependency> <dependency> <groupId>org.apache.poi</groupId> <artifactId>ooxml-schemas</artifactId> <version>1.1</version> </dependency>

怎样提取pdf表格的内容（关于PDF文件表格提取实现）

最新推荐

热门推荐