Commit 52ba18e3 authored by eddie.woo's avatar eddie.woo

add

parent 660c9f0d
......@@ -278,6 +278,24 @@
<version>3.9.1</version> <!-- Stay on 1.7.1 to support Java 6 -->
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>1.17</version>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<version>1.17</version>
</dependency>
<dependency>
<groupId>com.javaaxp</groupId>
<artifactId>java-axp</artifactId>
<version>1.0</version>
<scope>system</scope>
<systemPath>${basedir}/lib/java-axp-1.0-SNAPSHOT.jar</systemPath>
</dependency>
</dependencies>
<build>
<finalName>atms-api</finalName>
......
package pwc.taxtech.atms.common;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.pdf.PDFParser;
import org.apache.tika.parser.pdf.PDFParserConfig;
import org.apache.tika.sax.BodyContentHandler;
import java.io.File;
import java.io.FileInputStream;
/**
 * Manual smoke test that extracts the text of a hard-coded PDF file with
 * Tika's {@link PDFParser} and prints it to standard output.
 */
public class PDFTest {

    /**
     * Parses {@code C:\woo\test.pdf} with position-sorted text extraction and
     * prints the extracted body text.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // try-with-resources guarantees the file handle is released even when
        // parsing fails; previously the stream was never closed.
        try (FileInputStream inputstream = new FileInputStream(new File("C:\\woo\\test.pdf"))) {
            BodyContentHandler handler = new BodyContentHandler();
            Metadata metadata = new Metadata();
            ParseContext pcontext = new ParseContext();
            PDFParser pdfparser = new PDFParser();
            pdfparser.getPDFParserConfig().setSortByPosition(true); // see the underlying implementation
            // Alternative PDFParserConfig settings, currently disabled:
            // pdfparser.getPDFParserConfig().setEnableAutoSpace(false);
            // pdfparser.getPDFParserConfig().setAverageCharTolerance(1f);
            // pdfparser.getPDFParserConfig().setSpacingTolerance(20f);
            pdfparser.parse(inputstream, handler, metadata, pcontext);
            System.out.println(handler.toString());
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
package pwc.taxtech.atms.common;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.ss.formula.eval.ValueEval;
import org.apache.poi.ss.formula.functions.FreeRefFunction;
import org.apache.poi.ss.formula.udf.AggregatingUDFFinder;
import org.apache.poi.ss.formula.udf.DefaultUDFFinder;
import org.apache.poi.ss.formula.udf.UDFFinder;
import org.apache.poi.ss.usermodel.*;
import org.apache.poi.ss.util.CellReference;
import java.io.*;
/**
 * Manual smoke test for POI formula evaluation with a user-defined function.
 *
 * <p>Loads a hard-coded workbook, registers {@code TmpFunction}, changes one
 * input cell, evaluates all formulas, replaces every formula cell with its
 * numeric value and writes the workbook back to the same file.
 */
public class POITest {

    /**
     * Runs the evaluation round trip against {@code C:\source\test - Copy.xlsx}.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        File workbookFile = new File("C:\\source\\test - Copy.xlsx");
        // Both the input stream and the workbook are closed automatically,
        // including on failure; previously neither was ever closed.
        try (FileInputStream fis = new FileInputStream(workbookFile);
             Workbook workbook = WorkbookFactory.create(fis)) {
            String[] functionNames = {"TmpFunction"};
            FreeRefFunction[] functionImpls = {new TmpFunction()};
            UDFFinder udfs = new DefaultUDFFinder(functionNames, functionImpls);
            UDFFinder udfToolpack = new AggregatingUDFFinder(udfs);
            workbook.addToolPack(udfToolpack);
            FormulaEvaluator evaluator = workbook.getCreationHelper().createFormulaEvaluator();
            int sheetNum = workbook.getNumberOfSheets();

            // Change one input value so the formulas have something new to compute.
            Sheet st1 = workbook.getSheetAt(0);
            st1.getRow(1).getCell(0).setCellType(CellType.NUMERIC);
            st1.getRow(1).getCell(0).setCellValue(15);
            evaluator.evaluateAll();

            for (int i = 0; i < sheetNum; i++) {
                Sheet tmpSheet = workbook.getSheetAt(i);
                for (int r = tmpSheet.getFirstRowNum(); r <= tmpSheet.getLastRowNum(); r++) {
                    Row row = tmpSheet.getRow(r);
                    if (null == row) {
                        continue;
                    }
                    // getLastCellNum() is the index of the last cell PLUS ONE, so the
                    // bound is exclusive. For a row without cells both bounds are -1
                    // and the loop body is skipped instead of calling getCell(-1).
                    int firstCell = row.getFirstCellNum();
                    int endCell = row.getLastCellNum();
                    for (int c = firstCell; c < endCell; c++) {
                        System.out.println("row :" + r + " cell: " + c);
                        Cell tmp = row.getCell(c);
                        if (null != tmp && tmp.getCellTypeEnum().equals(CellType.FORMULA)) {
                            // Replace the formula with its numeric value.
                            tmp.setCellType(CellType.NUMERIC);
                            tmp.setCellValue(tmp.getNumericCellValue());
                        }
                    }
                }
            }

            // Closing the output stream via try-with-resources also covers the
            // case where write() throws.
            try (FileOutputStream excelFileOutPutStream = new FileOutputStream(workbookFile)) {
                workbook.write(excelFileOutPutStream);
                excelFileOutPutStream.flush();
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (InvalidFormatException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
package pwc.taxtech.atms.common;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import java.io.*;
/**
 * Manual smoke test that extracts the text of a hard-coded XPS document with
 * {@code TmpXPSParser} and prints it to standard output.
 */
public class TikaTest {

    /**
     * Parses the XPS file and prints the text collected by the body handler.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        File xpsFile = new File("C:\\woo\\海关稽核结果.xps");
        // try-with-resources releases the file handle even when the parser
        // throws before it gets a chance to close the stream itself.
        try (InputStream inputStream = new FileInputStream(xpsFile)) {
            Metadata metadata = new Metadata();
            ContentHandler handler = new BodyContentHandler();
            new TmpXPSParser().parse(inputStream, handler, metadata, new ParseContext());
            String content = handler.toString();
            System.out.println(content);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
package pwc.taxtech.atms.common;
import org.apache.poi.ss.formula.OperationEvaluationContext;
import org.apache.poi.ss.formula.eval.*;
import org.apache.poi.ss.formula.functions.FreeRefFunction;
import org.apache.poi.ss.util.CellReference;
public class TmpFunction implements FreeRefFunction {
@Override
public ValueEval evaluate(ValueEval[] valueEvals, OperationEvaluationContext operationEvaluationContext) {
// if (valueEvals.length != 3) {
// return ErrorEval.VALUE_INVALID;
// }
try {
ValueEval v1 = OperandResolver.getSingleValue(valueEvals[0],
operationEvaluationContext.getRowIndex(),
operationEvaluationContext.getColumnIndex());
ValueEval v2 = OperandResolver.getSingleValue(valueEvals[1],
operationEvaluationContext.getRowIndex(),
operationEvaluationContext.getColumnIndex());
String val1 = OperandResolver.coerceValueToString(v1);
int val2 = OperandResolver.coerceValueToInt(v2);
CellReference reference = new CellReference(val1);
return new NumberEval(operationEvaluationContext.getWorkbook().getSheet(0)
.getCell(reference.getRow(),reference.getCol()).getNumericCellValue() + val2);
} catch (EvaluationException e) {
e.printStackTrace();
}
return null;
}
}
package pwc.taxtech.atms.common;
import java.io.IOException;
import java.io.InputStream;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import javaaxp.core.service.IXPSAccess;
import javaaxp.core.service.IXPSPageAccess;
import javaaxp.core.service.XPSError;
import javaaxp.core.service.impl.XPSServiceImpl;
import javaaxp.core.service.impl.document.jaxb.CTCanvas;
import javaaxp.core.service.impl.document.jaxb.CTGlyphs;
import javaaxp.core.service.impl.document.jaxb.CTPath;
import javaaxp.core.service.model.document.page.IFixedPage;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
public class TmpXPSParser implements Parser {
private double currentXPosition = 0;
/**
*
*/
private static final long serialVersionUID = -3528366722867144747L;
private static final Set<MediaType> SUPPORTED_TYPES =
Collections.singleton(MediaType.application("vnd.ms-xpsdocument"));
private static final String XPS_MIME_TYPE = "application/vnd.ms-xpsdocument";
private XHTMLContentHandler fileXHTML;
public Set<MediaType> getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;
}
public void parse(
InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
metadata.set(Metadata.CONTENT_TYPE, XPS_MIME_TYPE);
fileXHTML = new XHTMLContentHandler(handler, metadata);
try {
parseXPS(stream);
} catch (XPSError e) {
throw new IOException(e);
}
stream.close();
}
private void parseXPS(InputStream inputStream) throws XPSError, SAXException {
IXPSAccess xpsAccess = XPSServiceImpl.getInstance().getXPSAccess(inputStream);
xhtmlStartDocument();
int firstDocNum = xpsAccess.getDocumentAccess().getFirstDocNum();
int lastDocNum = xpsAccess.getDocumentAccess().getLastDocNum();
for (int i = firstDocNum; i <= lastDocNum; i++) {
IXPSPageAccess xpsPageAccess = xpsAccess.getPageAccess(i);
int firstPageNum = xpsPageAccess.getFirstPageNum();
int lastPageNum = xpsPageAccess.getLastPageNum();
for (int j = firstPageNum; j <= lastPageNum; j++) {
IFixedPage fixedPage = xpsPageAccess.getPage(j);
parseObjs(fixedPage.getPathOrGlyphsOrCanvas());
}
}
xhtmlEndDocument();
}
private void parseObjs(List<Object> objs) throws XPSError, SAXException {
for (Object o : objs)
parseObj(o);
}
private void parseObj(Object xpsObj) throws XPSError, SAXException {
if (xpsObj instanceof CTCanvas) {
CTCanvas c = (CTCanvas) xpsObj;
xhtmlStartCanvas();
parseObjs(c.getPathOrGlyphsOrCanvas());
xhtmlEndCanvas();
} else if (xpsObj instanceof CTGlyphs) {
CTGlyphs c = (CTGlyphs) xpsObj;
if (c.getOriginX() < currentXPosition) {
fileXHTML.startElement("div");
fileXHTML.characters(" ");
fileXHTML.endElement("div");
}
String text = c.getUnicodeString();
xhtmlParagraph(text);
currentXPosition = c.getOriginX();
} else if (xpsObj instanceof CTPath) {
} else {
System.out.println("Unhandled type : " + xpsObj.getClass().getCanonicalName());
}
}
private void xhtmlStartDocument() throws SAXException {
fileXHTML.startDocument();
}
private void xhtmlEndDocument() throws SAXException {
fileXHTML.endDocument();
}
private void xhtmlStartCanvas() throws SAXException {
fileXHTML.startElement("div");
}
private void xhtmlEndCanvas() throws SAXException {
fileXHTML.endElement("div");
}
private void xhtmlParagraph(String text) throws SAXException {
fileXHTML.startElement("span");
fileXHTML.characters(text);
fileXHTML.endElement("span");
}
/**
* @deprecated This method will be removed in Apache Tika 1.0.
*/
public void parse(
InputStream stream, ContentHandler handler, Metadata metadata)
throws IOException, SAXException, TikaException {
parse(stream, handler, metadata, new ParseContext());
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment