Splitting and Merging large files (size in GB) in Java -
suppose,
i splitting 2590400 kb (approx 2.5 gb) file in 30 parts.
it produce 30 files size of 86347 kb.
seems correct, 2590400/30 = 86346.66666667now if merge parts (30) again producing file of 3453873 kb file, should 2590410 kb.
can me why difference there? using below code merge , split files.
splitfile.java
import java.io.bufferedoutputstream; import java.io.bufferedreader; import java.io.bufferedwriter; import java.io.file; import java.io.fileinputstream; import java.io.fileoutputstream; import java.io.filewriter; import java.io.ioexception; import java.io.inputstreamreader; import java.io.randomaccessfile; /** * @author vishal.zanzrukia * */ public class splitfile { public static final string input_file = "d:\\me\\projects\\input\\file\\path.txt"; public static final int number_of_output_files = 30; public static final string file_suffix = ".txt"; /** * split file * * @throws exception */ static void splitfile() throws exception{ file inputfile = new file(input_file + "_splits"); inputfile.mkdir(); randomaccessfile raf = new randomaccessfile(input_file, "r"); long sourcesize = raf.length(); long bytespersplit = sourcesize / number_of_output_files; long remainingbytes = sourcesize % number_of_output_files; int maxreadbuffersize = 8 * 1024; // 8kb (int destix = 1; destix <= number_of_output_files; destix++) { bufferedoutputstream bw = new bufferedoutputstream(new fileoutputstream(input_file + "_splits\\split." + destix + file_suffix)); if (bytespersplit > maxreadbuffersize) { long numreads = bytespersplit / maxreadbuffersize; long numremainingread = bytespersplit % maxreadbuffersize; (int = 0; < numreads; i++) { readwrite(raf, bw, maxreadbuffersize); } if (numremainingread > 0) { readwrite(raf, bw, numremainingread); } } else { readwrite(raf, bw, bytespersplit); } bw.close(); } if (remainingbytes > 0) { bufferedoutputstream bw = new bufferedoutputstream(new fileoutputstream("split." + number_of_output_files + 1)); readwrite(raf, bw, remainingbytes); bw.close(); } raf.close(); } /** * join file * * @throws exception */ static void joinfiles() throws exception{ int maxreadbuffersize = 8 * 1024; bufferedoutputstream bw = new bufferedoutputstream(new fileoutputstream(input_file + "_splits\\fulljoin" + file_suffix)); file inputfiledir = new file(input_file + "_splits"); randomaccessfile raf = null; if(inputfiledir.isdirectory()){ for(file file : inputfiledir.listfiles()){ raf = new randomaccessfile(file, "r"); long numreads = raf.length() / maxreadbuffersize; long numremainingread = raf.length() % maxreadbuffersize; (int = 0; < numreads; i++) { readwrite(raf, bw, maxreadbuffersize); } if (numremainingread > 0) { readwrite(raf, bw, numremainingread); } raf.close(); } } bw.close(); } public static void mergefiles() { file[] files = new file[number_of_output_files]; for(int i=1;i<=number_of_output_files;i++){ files[i-1] = new file(input_file + "_splits\\split."+i+file_suffix); } string mergedfilepath = input_file + "_splits\\fulljoin" + file_suffix; file mergedfile = new file(mergedfilepath); mergefiles(files, mergedfile); } public static void mergefiles(file[] files, file mergedfile) { filewriter fstream = null; bufferedwriter out = null; try { fstream = new filewriter(mergedfile, true); out = new bufferedwriter(fstream); } catch (ioexception e1) { e1.printstacktrace(); } (file f : files) { system.out.println("merging: " + f.getname()); fileinputstream fis; try { fis = new fileinputstream(f); bufferedreader in = new bufferedreader(new inputstreamreader(fis)); string aline; while ((aline = in.readline()) != null) { out.write(aline); out.newline(); } in.close(); } catch (ioexception e) { e.printstacktrace(); } } try { out.close(); } catch (ioexception e) { e.printstacktrace(); } } public static void main(string[] args) throws exception { // splitfile(); mergefiles(); } static void readwrite(randomaccessfile raf, bufferedoutputstream bw, long numbytes) throws ioexception { byte[] buf = new byte[(int) numbytes]; int val = raf.read(buf); if (val != -1) { bw.write(buf); } } }
use joinfiles
method: don't try read file line-by-line using reader
if want keep was, because line endings may differ platform.
instead read them binary file using inputstream
or randomaccessfile
, write using outputstream
.
the problem in joinfiles
method used file.listfiles()
makes no guarantees order in files returned.
i combined mergefiles()
code joinfiles()
make work (remember invoke joinfiles()
instead of mergefiles()
main
method)
static void joinfiles(file[] files) throws exception { int maxreadbuffersize = 8 * 1024; bufferedoutputstream bw = new bufferedoutputstream(new fileoutputstream(input_file + "_splits\\fulljoin" + file_suffix)); randomaccessfile raf = null; (file file : files) { raf = new randomaccessfile(file, "r"); long numreads = raf.length() / maxreadbuffersize; long numremainingread = raf.length() % maxreadbuffersize; (int = 0; < numreads; i++) { readwrite(raf, bw, maxreadbuffersize); } if (numremainingread > 0) { readwrite(raf, bw, numremainingread); } raf.close(); } bw.close(); } public static void joinfiles() throws exception { file[] files = new file[number_of_output_files]; (int = 1; <= number_of_output_files; i++) { files[i - 1] = new file(input_file + "_splits\\split." + + file_suffix); } joinfiles(files); }
Comments
Post a Comment