目标:将不同格式的JSON文件存入MySQL数据库
涉及的点有: 1. java处理JSON对象,直接见源码。 2. java.sql.SQLException: Incorrect string value: ‘\xF0\x9F\x99\x8F\xE5\x8D…’ for column ‘text’ at row 1报错问题,报错原因:因为我没有对插入文本做任何处理,文本内有不同字节的utf8字符,我的处理方式就是过滤后再插入,因为特殊的字符其实也没什么用。1 public static String StringFilter(String str) throws PatternSyntaxException { 2 // 清除掉所有特殊字符,只允许汉字字母数字和某些常见符号出现 3 String regEx = "[^0-9a-zA-Z\u4e00-\u9fa5~!@#$%^&*()+=|{}':;',//[//].<>/?~!@#¥%……&*()——+|{}【】‘;:”“’。,、?]+"; 4 Pattern p = Pattern.compile(regEx); 5 Matcher m = p.matcher(str); 6 return m.replaceAll(" ").trim().replaceAll("s+", " "); 7 } 8 9 public static void insertMicroblogs(String uid, String mid, String time,String geo,String source,10 String repost,String comment,String attitude,String text,int flag,Statement statement) throws Exception {11 if (uid == null) return;12 String sql = "insert into data2016.microblog values(\"" + mid+ "\",\"" + uid + "\",\"" + time13 + "\",\"" + geo+ "\",\"" + source + "\",\"" +repost + "\",\"" + comment + "\",\"" + attitude14 + "\",\"" + text+ "\"," + flag + ")";15 String selectsql = "select count(*) from data2016.microblog where uid = \""+ uid+ "\" and mid=\"" + mid + "\"";16 17 try {18 ResultSet rset = statement.executeQuery(selectsql);19 int rowCount = 0; //记录查询结果记录数20 if(rset.next()) { 21 rowCount=rset.getInt(1); 22 }23 if(rowCount == 0){24 statement.execute(sql);25 }26 27 } catch (Exception e) {28 pErrorUser.println("microblog:"+uid); 29 // System.out.println(sql);30 e.printStackTrace();31 return;32 }33 }34 public static void readUserMicroblogs() throws Exception{35 File file = new File("data/source/first/microblogs/microblogs12.txt"); 36 BufferedInputStream fis = new BufferedInputStream(new FileInputStream(file)); 37 BufferedReader reader = new BufferedReader(new InputStreamReader(fis,"utf-8"),5*1024*1024);// 用5M的缓冲读取文本文件 38 39 String line = "";40 int count = 0;41 while((line = reader.readLine()) != null){42 count++;43 JSONObject blogObject = new JSONObject(line);44 String uid = blogObject.getString("uid");45 String microblogs = blogObject.getString("microblogs");46 JSONArray microblogsArray = new JSONArray(microblogs);47 int size = microblogsArray.length();48 // System.out.println("Size:" + size);49 for (int i = 0; i < size; i++) {50 JSONObject jsonObj = microblogsArray.getJSONObject(i);51 String mid=jsonObj.get("mid").toString();52 String created_at=jsonObj.get("created_at").toString();53 String geo=StringFilter(jsonObj.get("geo").toString().replaceAll("\"", " "));54 String source=jsonObj.get("source").toString().replaceAll("\"", " ");55 String reposts_count=jsonObj.get("reposts_count").toString();56 String comments_count=jsonObj.get("comments_count").toString();57 String attitudes_count=jsonObj.get("attitudes_count").toString();58 String text;59 int flag;60 if(jsonObj.has("retweeted_status")){61 flag=1;62 String retweet = jsonObj.get("retweeted_status").toString();63 JSONObject weibo = new JSONObject(retweet);64 text=StringFilter(weibo.get("text").toString().replaceAll("\"", " "));65 }else{66 flag=0;67 text=StringFilter(jsonObj.get("text").toString().replaceAll("\"", " "));68 }69 insertMicroblogs(uid,mid,created_at,geo,source,reposts_count,comments_count,attitudes_count,text,flag,statement);70 71 }72 73 if(count%50==0){74 System.out.print(count +"...");75 }76 if(count%1000==0){77 System.out.println();78 }79 }80 }
如上代码是我对新浪微博数据文档的存入工作,因为文档较大,所以加入了缓存读取。遇到的其他问题基本都是特殊字符问题,其中插入文本中有双引号,原本处理方法是:
String source=jsonObj.get("source").toString().replaceAll("\"", "\\\" ");
但是不知道为嘛,转义字符没有成功加入,所以就直接将双引号替换为空格处理了。这一块内容原本不打算记,原以为是很顺利的事还是倒腾了一天,所以小记一下咯。