Hello,
I am new to Kotlin and decided to compare the performance of PDF text extraction in Kotlin and in Java, using the PDFBox 2.0.12 library. Here is the Java version, which extracts the text from a PDF and creates a list of text segments of a specified minimum size:
...
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
...
/**
 * Extracts the text of a PDF, page by page, and groups it into paragraph segments.
 *
 * <p>Each page's text is split with {@code Segment.PDF_END_OF_LINE}; every non-empty
 * piece has its whitespace runs collapsed via {@code Segment.REMOVE_MULTI_SPACES},
 * gets a trailing "." and is appended to an accumulator. A segment is emitted as soon
 * as the accumulator reaches {@code Segment.MIN_NUM_OF_CHARS} characters. Whatever is
 * left at the end of a page is emitted as a final, possibly shorter, segment, so a
 * segment never spans two pages.
 *
 * @param is stream positioned at the start of the PDF data
 * @return the segments in reading order, numbered consecutively from 0; empty when
 *         the document is encrypted
 * @throws Exception if the document cannot be loaded or its text cannot be extracted
 */
public static List<Segment> parse(InputStream is) throws Exception {
    int segNo = 0;
    List<Segment> segments = new ArrayList<>();
    try (PDDocument pdfDocument = PDDocument.load(is)) {
        if (!pdfDocument.isEncrypted()) {
            PDFTextStripper str = new PDFTextStripper();
            str.setLineSeparator("\n");
            str.setSortByPosition(true);
            StringBuilder accumulator = new StringBuilder();
            int n = pdfDocument.getNumberOfPages();
            // PDFTextStripper page numbers are 1-based: iterating 0..n-1 would extract
            // nothing on the first pass and never reach the last page.
            for (int page = 1; page <= n; page++) {
                str.setStartPage(page);
                str.setEndPage(page);
                String[] pars = Segment.PDF_END_OF_LINE.split(str.getText(pdfDocument).trim());
                for (String content : pars) {
                    content = content.trim();
                    if (!content.isEmpty()) {
                        accumulator.append(Segment.REMOVE_MULTI_SPACES.matcher(content)
                                .replaceAll(" ")).append(".");
                        if (accumulator.length() < Segment.MIN_NUM_OF_CHARS) {
                            // Still too short: keep accumulating, one piece per line.
                            accumulator.append("\n");
                        } else {
                            segments.add(new Segment(Segment.PARAGRAPH,
                                    accumulator.toString().trim(), segNo++));
                            accumulator.setLength(0);
                        }
                    }
                }
                // Flush the remainder so that segments do not cross page boundaries.
                if (accumulator.length() > 0) {
                    segments.add(new Segment(Segment.PARAGRAPH,
                            accumulator.toString().trim(), segNo++));
                    accumulator.setLength(0);
                }
            }
        }
    }
    return segments;
}
Here is my Kotlin code:
...
import org.apache.pdfbox.pdmodel.PDDocument
import org.apache.pdfbox.text.PDFTextStripper
...
/**
 * Splits the text of a PDF document into paragraph [Segment]s.
 */
object PdfParser {
    /**
     * Extracts the text of the PDF read from [fis], page by page, and groups it into segments.
     *
     * Each page's text is split with `PDF_END_OF_SENT`; every non-empty piece has its
     * whitespace runs collapsed via `SEG_REMOVE_MULTI_SPACES`, gets a trailing "." and is
     * appended to an accumulator. A segment is emitted as soon as the accumulator reaches
     * `SEG_MIN_NUM_OF_CHARS` characters. Whatever is left at the end of a page is emitted as
     * a final, possibly shorter, segment, so a segment never spans two pages.
     *
     * @param fis stream positioned at the start of the PDF data
     * @return the segments in reading order, numbered consecutively from 0; empty when the
     *   document is encrypted
     */
    fun parse(fis: InputStream): List<Segment> {
        var segNo = 0
        val segments = mutableListOf<Segment>()
        PDDocument.load(fis).use { pdfDocument ->
            if (!pdfDocument.isEncrypted) {
                val stripper = PDFTextStripper().apply {
                    lineSeparator = "\n"
                    sortByPosition = true
                }
                val accumulator = StringBuilder()
                // PDFTextStripper page numbers are 1-based: iterating 0 until n would extract
                // nothing on the first pass and never reach the last page.
                for (page in 1..pdfDocument.numberOfPages) {
                    stripper.startPage = page
                    stripper.endPage = page
                    val pars = PDF_END_OF_SENT.split(stripper.getText(pdfDocument).trim())
                    for (p in pars) {
                        val content = p.trim()
                        if (content.isNotEmpty()) {
                            // Chained appends avoid building a throwaway string per piece.
                            accumulator.append(SEG_REMOVE_MULTI_SPACES.replace(content, " ")).append(".")
                            if (accumulator.length < SEG_MIN_NUM_OF_CHARS) {
                                // Still too short: keep accumulating, one piece per line.
                                accumulator.append("\n")
                            } else {
                                segments.add(Segment(SEG_PARAGRAPH, accumulator.toString().trim(), segNo++))
                                accumulator.setLength(0)
                            }
                        }
                    }
                    // Flush the remainder so that segments do not cross page boundaries.
                    if (accumulator.isNotEmpty()) {
                        segments.add(Segment(SEG_PARAGRAPH, accumulator.toString().trim(), segNo++))
                        accumulator.setLength(0)
                    }
                }
            }
        }
        return segments
    }
}
When I measure the performance (giving both the same FileInputStream pointing to the 7.18 MB tornadofx-guide.pdf), the Java version performs the task in 3 seconds and the Kotlin version in 5 seconds!
In both cases I use the same JDK version (1.8.something) and the same PDFBox libraries.
Here are the test sequences in Java and Kotlin:
// Java
try (InputStream in = new FileInputStream("C:\\Users\\Andjelko\\Desktop\\tornadofx-guide.pdf")) {
    // Time a single parse() call with the wall clock and print the elapsed milliseconds.
    final long startMillis = System.currentTimeMillis();
    List<Segment> parsed = parse(in);
    final long elapsedMillis = System.currentTimeMillis() - startMillis;
    System.out.println(elapsedMillis);
} catch (Exception ex) {
    ex.printStackTrace();
}
// Kotlin
// `use` closes the stream even if parse() throws, matching the try-with-resources
// block of the Java harness; previously the stream was never closed.
FileInputStream("C:\\Users\\Andjelko\\Desktop\\tornadofx-guide.pdf").use { fis ->
    val st = System.currentTimeMillis()
    val list = PdfParser.parse(fis)
    val e = System.currentTimeMillis()
    println(e - st)
}
Am I missing something, or is Kotlin much slower than Java (apart from being much nicer)?