中科院分词工具的使用
中科院分词工具(NLPIR)的 Java 配置方法与其安装包中自带的说明相同,这里不再赘述。下面是调用代码:
package xieru; import hello.Hello.CLibrary; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.nio.charset.Charset; import java.util.regex.Pattern; import com.csvreader.CsvReader; import com.csvreader.CsvWriter; import com.sun.jna.Library; import com.sun.jna.Native; public class WriteSeparatewords { public interface CLibrary extends Library { // 定义并初始化接口的静态变量 CLibrary Instance = (CLibrary) Native.loadLibrary( "E:\\workplace\\hello\\NLPIR", CLibrary.class); // printf函数声明 public boolean NLPIR_Init(byte[] sDataPath, int encoding, byte[] sLicenceCode); public String NLPIR_ParagraphProcess(String sSrc, int bPOSTagged); public String NLPIR_GetKeyWords(String sLine,int nMaxKeyLimit,boolean bWeightOut); public void NLPIR_Exit(); } public static String transString(String aidString, String ori_encoding, String new_encoding) { try { return new String(aidString.getBytes(ori_encoding), new_encoding); } catch (UnsupportedEncodingException e) { e.printStackTrace(); } return null; } public static void it(String inFile,String outFile)throws IOException { File file=new File(inFile); FileInputStream fi=new FileInputStream(file); CsvReader cr=new CsvReader(fi,',', Charset.forName("GBK")); cr.readHeaders(); String[] readerS; FileWriter wr=new FileWriter(outFile); BufferedWriter bw=new BufferedWriter(wr); while(cr.readRecord()){ readerS=cr.getValues(); if(readerS[10].equals("技术")){ for(int i=0;i<readerS.length-1;i++) bw.write("\""+readerS[i]+"\""+","); bw.write("\""+readerS[readerS.length-1]+"\""); bw.newLine(); } } bw.flush(); bw.close(); } public static void fenci(String inFile,String outFile) throws IOException{ String argu = ""; String system_charset = "UTF-8"; int charset_type = 1; // int charset_type = 0; // 调用printf打印信息 if 
(!CLibrary.Instance.NLPIR_Init(argu.getBytes(system_charset), charset_type, "0".getBytes(system_charset))) { System.err.println("初始化失败!"); } String filePath=inFile; File file=new File(filePath); FileInputStream fi=new FileInputStream(file); CsvReader cr=new CsvReader(fi,',', Charset.forName("GBK")); cr.readHeaders(); String[] readerS; FileWriter wr=new FileWriter(outFile); BufferedWriter bw = new BufferedWriter(wr); while(cr.readRecord()){ readerS=cr.getValues(); String nativeBytes=null; nativeBytes = CLibrary.Instance.NLPIR_ParagraphProcess(readerS[1], 3); bw.write(readerS[0]+","+"\""+nativeBytes+"\""+","); System.out.println("分词结果为: " + nativeBytes); String nativeByte = CLibrary.Instance.NLPIR_GetKeyWords(readerS[1],10,true); bw.write(nativeByte); System.out.println("关键词提取结果是:"+nativeByte); bw.newLine(); System.out.println("-----------------------------------"); } bw.flush(); bw.close(); CLibrary.Instance.NLPIR_Exit(); } public static void main(String[] args) throws Exception { // WriteSeparatewords.fenci("F:/c/zhiweiyaoqiu.csv", "F:/c/fenci.csv"); WriteSeparatewords.it("F:/c/zhaopinxinxi.csv", "F:/c/it.csv"); } }