/** * Copyright (c) 2015-2016, Michael Yang 杨福海 (fuhai999@gmail.com). * * Licensed under the GNU Lesser General Public License (LGPL) ,Version 3.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.gnu.org/licenses/lgpl-3.0.txt * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.landtool.lanbase.common.utils; import java.util.ArrayList; import java.util.List; import org.apache.commons.lang.StringUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Attribute; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; /** * @author lanbase * @Description: TODO(html处理工具) * @date 2017-6-23 15:07 */ public class JsoupUtils { public static String getFirstImageSrc(String html) { if (html == null) return null; Elements es = Jsoup.parseBodyFragment(html).select("img"); if (es != null && es.size() > 0) return es.first().attr("src"); return null; } public static List getImageSrcs(String html) { if (StringUtils.isBlank(html)) { return null; } List list = new ArrayList(); Document doc = Jsoup.parseBodyFragment(html); Elements es = doc.select("img"); if (es != null && es.size() > 0) { for (Element e : es) { list.add(e.attr("src")); } } return list; } public static String getText(String html) { return Jsoup.parse(html).text(); } public static String getBodyHtml(String html) { if (StringUtils.isNotBlank(html)) { Document document = Jsoup.parse(html); if (null != document && document.body() != null) { return document.body().html().toString(); } } return html; } static MyWhitelist whitelist = new MyWhitelist(); public static String clear(String html) { if (StringUtils.isNotBlank(html)) return Jsoup.clean(html, whitelist); return html; } /** * 做自己的白名单,允许base64的图片通过等 * * @author michael */ public static class MyWhitelist extends org.jsoup.safety.Whitelist { public MyWhitelist() { addTags("a", "b", "blockquote", "br", "caption", "cite", "code", "col", "colgroup", "dd", "div", "dl", "dt", "em", "h1", "h2", "h3", "h4", "h5", "h6", "i", "img", "li", "ol", "p", "pre", "q", "small", "strike", "strong", "sub", "sup", "table", "tbody", "td", "tfoot", "th", "thead", "tr", "u", "ul"); addAttributes("a", "href", "title", "target"); addAttributes("blockquote", "cite"); addAttributes("col", "span"); addAttributes("colgroup", "span"); addAttributes("img", "align", "alt", "src", "title"); addAttributes("ol", "start"); addAttributes("q", "cite"); addAttributes("table", "summary"); addAttributes("td", "abbr", "axis", "colspan", "rowspan", "width"); addAttributes("th", "abbr", "axis", "colspan", "rowspan", "scope", "width"); addAttributes("video", "src", "autoplay", "controls", "loop", "muted", "poster", "preload"); addAttributes(":all", "class"); addAttributes(":all", "style"); addAttributes(":all", "height"); addAttributes(":all", "width"); addAttributes(":all", "type"); addAttributes(":all", "id"); addAttributes(":all", "name"); addProtocols("a", "href", "ftp", "http", "https", "mailto", "tel"); addProtocols("blockquote", "cite", "http", "https"); addProtocols("cite", "cite", "http", "https"); addProtocols("img", "src", "http", "https"); addProtocols("q", "cite", "http", "https"); } @Override protected boolean isSafeAttribute(String tagName, Element el, Attribute attr) { return ("img".equals(tagName) && "src".equals(attr.getKey()) && attr.getValue().startsWith("data:;base64")) || super.isSafeAttribute(tagName, el, attr); } } }