下面是自己编写的网络抓取功能实现:将抓取到的结果进行过滤,并写入到 TXT 文档中(记录下来,以防忘记)。原创哟!
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.ObjectOutputStream;
import java.io.OutputStreamWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Small web-scraping example: downloads a page, extracts every song title
 * matched by a regular expression and writes the titles to a text file.
 */
public class WebClient4 {

    /** Browser-like User-Agent header sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; Trident/4.0; "
                    + ".NET CLR 1.1.4322; .NET CLR 2.0.50727)";

    /** Charset used by {@link #getWebContent(String)}. */
    private static final String DEFAULT_CHARSET = "iso-8859-1";

    /** Timeout in milliseconds used by {@link #getWebContent(String)} and {@link #main}. */
    private static final int DEFAULT_TIMEOUT_MS = 5000;

    /** Page scraped by {@link #main} when no URL argument is given. */
    // Another chart page that was tried earlier: http://music.baidu.com/top/new
    private static final String DEFAULT_URL = "http://music.baidu.com/top/new/week/";

    /** File written by {@link #main} when no path argument is given. */
    private static final String DEFAULT_OUTPUT_PATH = "F:/cc2.txt";

    /**
     * Captures the {@code title} attribute of every "song-link" anchor.
     * Compiled once because the expression never changes.
     */
    // A looser alternative without the class filter:
    //   "<a href=\\\"/song/.*?\\\" title=\\\"(.*?)\\\">"
    private static final Pattern SONG_TITLE_PATTERN = Pattern.compile(
            "<a class=\\\"song-link\\\" href=\\\"/song/.*?\\\" title=\\\"(.*?)\\\">");

    /**
     * Downloads the body of a web page as text.
     *
     * @param urlString address to fetch; {@code http://} is prepended when the
     *                  value starts with neither {@code http://} nor {@code https://}
     * @param charset   name of the charset used to decode the response body
     * @param timeout   connect and read timeout in milliseconds
     * @return the page content with every line terminated by {@code "\r\n"}, or
     *         {@code null} when {@code urlString} is null/empty, the status is
     *         not 200, or reading the status line fails
     * @throws IOException if the URL is malformed, the charset is unsupported,
     *                     or reading the body fails
     */
    public static String getWebContent(String urlString, final String charset, int timeout)
            throws IOException {
        if (urlString == null || urlString.length() == 0) {
            return null;
        }
        if (!urlString.startsWith("http://") && !urlString.startsWith("https://")) {
            urlString = "http://" + urlString;
        }

        URL url = new URL(urlString);
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        try {
            conn.setRequestProperty("User-Agent", USER_AGENT);
            conn.setRequestProperty("Accept", "text/html");
            conn.setConnectTimeout(timeout);
            // Without a read timeout a server that accepts the connection but
            // never answers would block this call forever.
            conn.setReadTimeout(timeout);

            int status;
            try {
                status = conn.getResponseCode();
            } catch (IOException e) {
                // Best effort: a failed request is reported and yields null.
                e.printStackTrace();
                return null;
            }
            if (status != HttpURLConnection.HTTP_OK) {
                return null;
            }

            InputStream input = conn.getInputStream();
            try {
                BufferedReader reader =
                        new BufferedReader(new InputStreamReader(input, charset));
                StringBuilder sb = new StringBuilder();
                String line;
                while ((line = reader.readLine()) != null) {
                    sb.append(line).append("\r\n");
                }
                return sb.toString();
            } finally {
                // Closes the stream even when decoding or reading fails.
                input.close();
            }
        } finally {
            // Always release the connection, including on early returns.
            conn.disconnect();
        }
    }

    /**
     * Downloads a page decoded as ISO-8859-1 with a 5 second timeout.
     * ISO-8859-1 maps every byte to exactly one char, so the caller can get the
     * raw bytes back with {@code getBytes("iso-8859-1")} and re-decode them.
     *
     * @param urlString address to fetch
     * @return the page content, or {@code null} as described in
     *         {@link #getWebContent(String, String, int)}
     * @throws IOException if the download fails
     */
    public static String getWebContent(String urlString) throws IOException {
        return getWebContent(urlString, DEFAULT_CHARSET, DEFAULT_TIMEOUT_MS);
    }

    /**
     * Fetches the chart page, prints every matched song title and saves the
     * titles, one per line, to a text file.
     *
     * @param args optional: {@code args[0]} overrides the page URL and
     *             {@code args[1]} overrides the output file path
     * @throws IOException if the download fails
     */
    public static void main(String[] args) throws IOException {
        String url = (args != null && args.length > 0) ? args[0] : DEFAULT_URL;
        String outputPath = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT_PATH;

        System.out.println("开始");

        // The page is UTF-8, so decode it as such directly; this gives the same
        // text as fetching as ISO-8859-1 and re-decoding the bytes afterwards.
        String s = getWebContent(url, "utf-8", DEFAULT_TIMEOUT_MS);
        if (s == null) {
            // getWebContent returns null for non-200 responses; stop here
            // instead of failing with a NullPointerException.
            System.err.println("抓取失败: " + url);
            return;
        }

        wirteString(outputPath, collectSongTitles(s));
        System.out.println("结束");
    }

    /** Prints each title matched in {@code html} and returns them joined by CRLF. */
    private static String collectSongTitles(String html) {
        Matcher m = SONG_TITLE_PATTERN.matcher(html);
        StringBuilder titles = new StringBuilder();
        while (m.find()) {
            String title = m.group(1);
            System.out.println(title);
            titles.append(title).append("\r\n");
        }
        return titles.toString();
    }

    /**
     * Writes a string to a file as UTF-8, replacing any existing content.
     * Failures are reported on standard error and not rethrown. The misspelled
     * method name is kept so existing callers continue to compile.
     *
     * @param path    file to write
     * @param context text to store
     */
    public static void wirteString(String path, String context) {
        try {
            FileOutputStream out = new FileOutputStream(path);
            try {
                // Explicit UTF-8 so the file content does not depend on the
                // platform default encoding.
                BufferedWriter writer =
                        new BufferedWriter(new OutputStreamWriter(out, "UTF-8"));
                writer.write(context);
                writer.flush();
            } finally {
                out.close();
            }
            System.out.println("写入字符串成功!");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Serializes an object to a file using Java object serialization.
     * Failures are reported on standard error and not rethrown.
     *
     * @param o    object to serialize
     * @param path file to write
     */
    public void write(Object o, String path) {
        try {
            FileOutputStream fileStream = new FileOutputStream(path);
            try {
                ObjectOutputStream os = new ObjectOutputStream(fileStream);
                os.writeObject(o);
                // Push buffered object data to the file before it is closed.
                os.flush();
            } finally {
                fileStream.close();
            }
            System.out.println("写入数据成功");
        } catch (IOException e) {
            // Also covers FileNotFoundException, which is an IOException.
            e.printStackTrace();
        }
    }
}