java去掉html標簽的方法:
1、通過純正則方法去掉html標簽;
2、使用“javax.swing.text.html.HTMLEditorKit”去掉html標簽;
3、通過使用Jsoup框架去掉html標簽等等。
一.純正則方法
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Strips HTML markup from a string using regular expressions only.
 *
 * <p>This is a lightweight, best-effort approach: it removes {@code <script>} and
 * {@code <style>} elements together with their content, then removes every remaining
 * {@code <...>} tag. It does not decode entities such as {@code &amp;}.
 */
public class HTMLSpirit {
    /** Matches a complete script element, including everything between its tags. */
    private static final Pattern SCRIPT_PATTERN =
            Pattern.compile("<script[^>]*?>[\\s\\S]*?<\\/script>", Pattern.CASE_INSENSITIVE);

    /** Matches a complete style element, including everything between its tags. */
    private static final Pattern STYLE_PATTERN =
            Pattern.compile("<style[^>]*?>[\\s\\S]*?<\\/style>", Pattern.CASE_INSENSITIVE);

    /** Matches any single tag; it contains no letters, so no case flag is needed. */
    private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]+>");

    /**
     * Removes script blocks, style blocks and all remaining tags from the given HTML.
     *
     * @param htmlStr the HTML text to clean; {@code null} is treated as empty input
     * @return the remaining text with leading and trailing whitespace trimmed,
     *         or an empty string when {@code htmlStr} is {@code null}
     */
    public static String delHTMLTag(String htmlStr) {
        if (htmlStr == null) {
            return "";
        }
        // Order matters: script/style bodies must go before the generic tag pass,
        // otherwise only their tags would be stripped and their content would remain.
        String text = SCRIPT_PATTERN.matcher(htmlStr).replaceAll("");
        text = STYLE_PATTERN.matcher(text).replaceAll("");
        text = TAG_PATTERN.matcher(text).replaceAll("");
        return text.trim();
    }
}
二.使用 javax.swing.text.html.HTMLEditorKit
import java.io.IOException;
import java.io.FileReader;
import java.io.Reader;
import java.util.List;
import java.util.ArrayList;
import javax.swing.text.html.parser.ParserDelegator;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.MutableAttributeSet;
/**
 * Extracts the text content of an HTML document using the Swing HTML parser
 * ({@code javax.swing.text.html.parser.ParserDelegator}).
 */
public class HTMLUtils {
    private HTMLUtils() {}

    /**
     * Parses HTML from the reader and collects every text chunk the parser reports.
     *
     * <p>The reader is not closed by this method; the caller owns it.
     *
     * @param reader source of the HTML markup
     * @return the text chunks in the order the parser reported them
     * @throws IOException if reading from {@code reader} fails
     */
    public static List<String> extractText(Reader reader) throws IOException {
        final List<String> chunks = new ArrayList<String>();
        // ParserCallback already supplies no-op implementations for tags, comments
        // and errors, so only the text callback needs to be overridden.
        ParserCallback collector = new ParserCallback() {
            @Override
            public void handleText(final char[] data, final int pos) {
                chunks.add(new String(data));
            }
        };
        // Third argument is ignoreCharSet: true keeps parsing with the reader's
        // decoding even if the document declares a different charset.
        new ParserDelegator().parse(reader, collector, true);
        return chunks;
    }

    /**
     * Demo entry point: prints each extracted text chunk of {@code java-new.html}.
     *
     * @param args unused
     * @throws Exception if the file cannot be opened or read
     */
    public final static void main(String[] args) throws Exception {
        // try-with-resources guarantees the file handle is released even on failure.
        try (FileReader reader = new FileReader("java-new.html")) {
            for (String line : HTMLUtils.extractText(reader)) {
                System.out.println(line);
            }
        }
    }
}
三.使用Jsoup框架
import java.io.IOException;
import java.io.FileReader;
import java.io.Reader;
import java.io.BufferedReader;
import org.jsoup.Jsoup;
public class HTMLUtils {
private HTMLUtils() {}
public static String extractText(Reader reader) throws IOException {
StringBuilder sb = new StringBuilder();
BufferedReader br = new BufferedReader(reader);
String line;
while ( (line=br.readLine()) != null) {
sb.append(line);
}
String textOnly = Jsoup.parse(sb.toString()).text();
return textOnly;
}
public final static void main(String[] args) throws Exception{
FileReader reader = new FileReader
("C:/RealHowTo/topics/java-language.html");
System.out.println(HTMLUtils.extractText(reader));
}
四.使用Apache Tika
import java.io.FileInputStream;
import java.io.InputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.WriteOutContentHandler;
import org.xml.sax.ContentHandler;
/**
 * Demo that extracts the text of an HTML file with Apache Tika.
 */
public class ParseHTMLWithTika {
    /**
     * Parses {@code C:/Temp/java-x.html} and prints the extracted text to stdout.
     *
     * <p>Failures are propagated to the caller instead of being printed and
     * swallowed, so a failed run ends with a non-zero exit status.
     *
     * @param args unused
     * @throws Exception if the file cannot be opened or parsing fails
     */
    public static void main(String args[]) throws Exception {
        // try-with-resources closes the stream on both success and failure.
        try (InputStream is = new FileInputStream("C:/Temp/java-x.html")) {
            // The constructor argument is the write limit (maximum number of
            // characters collected); it is raised here so large documents fit.
            WriteOutContentHandler contenthandler = new WriteOutContentHandler(100000000);
            Metadata metadata = new Metadata();
            Parser parser = new AutoDetectParser();
            parser.parse(is, contenthandler, metadata, new ParseContext());
            System.out.println(contenthandler.toString());
        }
    }
}
注意這里經過本人實驗有個小坑,WriteOutContentHandler參數是限制的字符數,這個如果不設置默認是10萬,超過會報異常。
具體的jar包請自行到中央倉庫里搜索依賴配置
https://search.maven.org/ 和 https://mvnrepository.com/ 文章來源:http://www.zghlxwxcb.cn/news/detail-646306.html
工具類 文章來源地址 http://www.zghlxwxcb.cn/news/detail-646306.html
/**
 * Helpers for reading classpath resources.
 */
public class ResourceUtil {
    /**
     * Reads a resource, resolved relative to the given class, fully into a string.
     *
     * <p>The resource is opened as a stream rather than through a {@code File}
     * path, so it also works when the path contains spaces or when the resource
     * is packaged inside a jar. Bytes are decoded with the platform default charset.
     *
     * @param currentClass class used to resolve the resource, e.g. {@code getClass()}
     * @param resourcePath e.g. {@code /data/json/xxx.json} (relative to the resources folder)
     * @return the complete content of the resource
     * @throws IOException if the resource does not exist or cannot be read
     */
    public static String resource2String(Class<?> currentClass, String resourcePath) throws IOException {
        java.io.InputStream in = currentClass.getResourceAsStream(resourcePath);
        if (in == null) {
            // getResourceAsStream signals "not found" with null; report it clearly
            // instead of failing later with a NullPointerException.
            throw new java.io.FileNotFoundException("Resource not found: " + resourcePath);
        }
        // Closing the reader also closes the underlying resource stream.
        try (java.io.Reader reader = new java.io.InputStreamReader(in)) {
            return IOUtils.toString(reader);
        }
    }
}
到了這里,關于java怎么去掉html標簽的文章就介紹完了。如果您還想了解更多內容,請在右上角搜索TOY模板網以前的文章或繼續瀏覽下面的相關文章,希望大家以后多多支持TOY模板網!