文件A.txt
1|369001|O|186600.18
2|780017|O|66219.63
3|1233140|F|270741.97
4|1367761|O|41714.38
5|444848|F|122444.33
6|556222|F|50883.96
文件B.txt
1|7759468|384484
1|3365454|365455
1|3184989|184990
2|5308487|58508
3|214849|89850
3|951772|326776
3|1468981|93984
4|4401735|276760
5|5428465|428466
5|6196340|196341
5|1876509|1513
6|6981773|106787
读取两个文件 A.txt 和 B.txt,两个文件每一行的第一列为主键,当主键相等时把两行连接起来并写入输出文件,类似于数据库中的 join 连接。
A,B主键都有序,A主键重复
方案一
读取B.txt一行,然后扫描整个A.txt 主键相等时,连接
/**
 * Scheme 1: joins A.txt with B.txt on the first '|'-separated field of each
 * line and writes every matching pair (the A line immediately followed by the
 * B line) to output.txt.
 *
 * <p>Both inputs are read exactly once, front to back. Known limitation of
 * this scheme: when the inner loop breaks, the B line whose key is greater
 * than the current A key has already been consumed, so it is never compared
 * with any later A line and its join result is lost.
 *
 * @param args unused
 * @throws IOException declared for signature compatibility; I/O failures
 *         inside the method are caught and reported on standard output
 * @throws NumberFormatException if the first field of a line is not an int
 */
public static void main(String[] args) throws IOException {
    String dir = "D:" + File.separator + "bigfile" + File.separator;

    // try-with-resources closes whatever was successfully opened, in reverse
    // order, even when opening a later resource or the join itself fails.
    try (BufferedReader A_br = new BufferedReader(new InputStreamReader(new FileInputStream(dir + "A.txt")));
            BufferedReader B_br = new BufferedReader(new InputStreamReader(new FileInputStream(dir + "B.txt")));
            BufferedWriter output_bw = new BufferedWriter(
                    new OutputStreamWriter(new FileOutputStream(new File(dir + "output.txt")), "UTF-8"))) {

        String A_line;
        while ((A_line = A_br.readLine()) != null) {
            // Limit 2: only the key needs to be separated from the rest of the line.
            int aKey = Integer.parseInt(A_line.split("\\|", 2)[0]);

            String B_line;
            while ((B_line = B_br.readLine()) != null) {
                int bKey = Integer.parseInt(B_line.split("\\|", 2)[0]);
                if (aKey == bKey) {
                    output_bw.write(A_line + B_line + "\n");
                }
                if (aKey < bKey) {
                    // B has moved past the current A key; advance A.
                    // The B line just read is dropped here (see limitation above).
                    break;
                }
            }
        }
    } catch (FileNotFoundException e) {
        System.out.println("找不到指定文件");
    } catch (IOException e) {
        System.out.println("读取文件失败");
    }
}
问题:B.txt读取一行没有问题,但是A.txt读取完成后,B.txt读取一行,A.txt已经读取完成,无法重复读取
所以求方法。(A.txt放进内存,重复读取)
方案二
投机取巧型
A.txt有序
先读取A.txt一行,再扫描B.txt,当主键相等时进行连接到同一行,不相等时候终止循环
/**
 * Scheme 2: reads A.txt line by line and, for each A line, keeps reading
 * B.txt while the keys (first '|'-separated field) allow a match. Every
 * matching pair is written to output.txt as the A line immediately followed
 * by the B line.
 *
 * <p>Known limitation of this scheme: the inner loop can only detect that B
 * has moved past the current A key by reading one B line too many. That line
 * is discarded when the loop breaks, so its join result is lost.
 *
 * @param args unused
 * @throws IOException declared for signature compatibility; I/O failures
 *         inside the method are caught and reported on standard output
 * @throws NumberFormatException if the first field of a line is not an int
 */
public static void main(String[] args) throws IOException {
    String dir = "D:" + File.separator + "bigfile" + File.separator;

    // try-with-resources closes whatever was successfully opened, in reverse
    // order, even when opening a later resource or the join itself fails.
    try (BufferedReader A_br = new BufferedReader(new InputStreamReader(new FileInputStream(dir + "A.txt")));
            BufferedReader B_br = new BufferedReader(new InputStreamReader(new FileInputStream(dir + "B.txt")));
            BufferedWriter output_bw = new BufferedWriter(
                    new OutputStreamWriter(new FileOutputStream(new File(dir + "output.txt")), "UTF-8"))) {

        String A_line;
        while ((A_line = A_br.readLine()) != null) {
            // The A key is constant for the whole inner scan, so parse it once.
            int aKey = Integer.parseInt(A_line.split("\\|", 2)[0]);

            String B_line;
            while ((B_line = B_br.readLine()) != null) {
                int bKey = Integer.parseInt(B_line.split("\\|", 2)[0]);
                if (aKey < bKey) {
                    // B has moved past the current A key; advance A.
                    // The B line just read is dropped here (see limitation above).
                    break;
                }
                if (aKey == bKey) {
                    output_bw.write(A_line + B_line + "\n");
                }
            }
        }
    } catch (FileNotFoundException e) {
        System.out.println("找不到指定文件");
    } catch (IOException e) {
        System.out.println("读取文件失败");
    }
}
问题:A读取一行没有问题;B逐行读取,当主键相等时连接并写文件,但是必须多读取B的下一行才能判断主键已经不相等,而这一行已经被读走,下一轮循环不会再处理它,因此会造成数据丢失。
https://blog.csdn.net/tomoya_chen/article/details/68958274
依题意,A文件主键重复,B文件主键不重复
先读取B文件,将B文件的内容存入Map对象,即Map《B文件主键, B文件内容》
遍历A文件,根据A文件主键检查Map对象中是否有匹配的B文件,如果存在则拼接内容并写入文件,否则continue;
如果存在主键重复的情况,可以对前述方案进行改进,比如A文件,将文件内容存入Map对象,即Map《A文件主键,List》
List里放文件内容
并非是最好的答案,前提是A,B主键都有序,A主键重复,所以B表在前,A表在后面。虽然Map在很小的数据量时可以解决,但是当数据量很大时会造成内存溢出。最好的解答方式是,A,B无序,并且能够处理的数据量超过内存,需要进行存储优化,和先进行排序。
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
/**
 * Merge-joins two pipe-delimited files on their first field and reports the
 * elapsed time.
 *
 * <p>The join makes a single forward pass over each input and keeps at most
 * one look-ahead line in memory, so it relies on both inputs being sorted in
 * ascending numeric key order and on keys being unique in the orders input.
 */
public class TestDemo {

    /**
     * Joins orders.tbl with lineitem.tbl and writes the result to result.tbl,
     * then prints the running time in whole seconds.
     *
     * @param args unused
     * @throws IOException declared for signature compatibility; I/O failures
     *         inside the method are caught and reported on standard output
     */
    public static void main(String[] args) throws IOException {
        long startTime = System.currentTimeMillis();
        // try-with-resources closes all three streams even when the join fails
        // part-way through.
        try (BufferedReader orders_br = new BufferedReader(
                        new InputStreamReader(new FileInputStream("/root/test/2.17.3/dbgen/orders.tbl")));
                BufferedReader lineitem_br = new BufferedReader(
                        new InputStreamReader(new FileInputStream("/root/test/2.17.3/dbgen/lineitem.tbl")));
                BufferedWriter output_bw = new BufferedWriter(
                        new OutputStreamWriter(new FileOutputStream(new File("/root/test/result.tbl"))))) {
            mergeJoin(orders_br, lineitem_br, output_bw);
        } catch (FileNotFoundException e) {
            System.out.println("can not find file");
        } catch (IOException e) {
            System.out.println("read file failure");
            e.printStackTrace();
        }
        long endTime = System.currentTimeMillis();
        long runTime = endTime - startTime;
        System.out.println("Demo running time:" + runTime / 1000 + " second ");
    }

    /**
     * Writes one output line (orders line immediately followed by the lineitem
     * line, terminated by '\n') for every pair of lines with equal keys.
     *
     * <p>A lineitem line whose key is greater than the current orders key is
     * kept as look-ahead for the next orders line. A lineitem line whose key is
     * smaller than the current orders key has no partner and is skipped, so an
     * unmatched lineitem row cannot stall the join for the rows after it.
     *
     * @param orders   source of order lines
     * @param lineitem source of lineitem lines
     * @param out      destination; not flushed or closed by this method
     * @throws IOException if reading or writing fails
     * @throws NumberFormatException if a key is not a decimal integer
     */
    static void mergeJoin(BufferedReader orders, BufferedReader lineitem, BufferedWriter out)
            throws IOException {
        String pendingItem = null; // look-ahead lineitem line not yet consumed
        String orderLine;
        while ((orderLine = orders.readLine()) != null) {
            long orderKey = keyOf(orderLine);
            while (true) {
                if (pendingItem == null) {
                    pendingItem = lineitem.readLine();
                    if (pendingItem == null) {
                        break; // lineitem input exhausted
                    }
                }
                long itemKey = keyOf(pendingItem);
                if (itemKey > orderKey) {
                    break; // belongs to a later order; keep it buffered
                }
                if (itemKey == orderKey) {
                    out.write(orderLine + pendingItem + "\n");
                }
                // Matched or orphaned (itemKey < orderKey): consume the line.
                pendingItem = null;
            }
        }
    }

    /** Parses the key: the text before the first '|', or the whole line if there is none. */
    private static long keyOf(String line) {
        int bar = line.indexOf('|');
        return Long.parseLong(bar < 0 ? line : line.substring(0, bar));
    }
}