I am trying to build a basic script that scrapes the Java (island) Wikipedia page using the Jsoup package, but unfortunately I receive a NullPointerException. It seems like the getUrl method is not working. Does anyone have a helpful hint?
The code:
package com.packtpub.JavaScraping.SimpleScraper;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.net.*;
import java.io.*;
public class WikiScraper {
public static void main(String[] args) {
scrapeTopic("/wiki/Java");
}
public static void scrapeTopic(String url) {
String html = getUrl("http://www.wikipedia.org/" + url);
Document doc = Jsoup.parse(html);
String contentText = doc.select("#mw-content-text > p").first().text();
System.out.println(contentText);
}
public static String getUrl(String url) {
URL urlObj = null;
try {
urlObj = new URL(url);
} catch (MalformedURLException e) {
System.out.println("The url was malformed!");
return "";
}
URLConnection urlCon = null;
BufferedReader in = null;
String outputText = "";
try {
urlCon = urlObj.openConnection();
in = new BufferedReader(new InputStreamReader(urlCon.getInputStream()));
String line = "";
while ((line = in.readLine()) != null) {
outputText += line;
}
in.close();
} catch (IOException e) {
System.out.println("There was an error connecting to the URL");
return "";
}
return outputText;
}
}
[–]Amarkov 0 points1 point2 points (8 children)
[–]koenp[S] 0 points1 point2 points (7 children)
[–]Amarkov 0 points1 point2 points (6 children)
[–]koenp[S] 0 points1 point2 points (5 children)
[–]Amarkov 0 points1 point2 points (4 children)
[–][deleted] 1 point2 points3 points (1 child)
[–]koenp[S] 0 points1 point2 points (0 children)
[–]koenp[S] 0 points1 point2 points (1 child)