/ utilities to quickly load a csv file - for more exhaustive analysis of the csv contents see csvguess.q / 2007.10.20 - updated to match latest csvguess.q / .csv.colhdrs[file] - return a list of colhdrs from file / info:.csv.info[file] - return a table of information about the file / columns are: / c - column name; ci - column index; t - load type; mw - max width; / dchar - distinct characters in values; rule - rule that caught the type / maybe - needs checking, _could_ be say a date, but perhaps just a float? / .csv.info0[file;onlycols] - like .csv.info except that it only analyses / example: / info:.csv.info0[file;(.csv.colhdrs file)like"*price"] / info:.csv.infolike[file;"*price"] / show delete from info where t=" " / .csv.data[file;info] - use the info from .csv.info to read the data / .csv.data10[file;info] - like .csv.data but only returns the first 10 rows / bulkload[file;info] - bulk loads file into table DATA (which must be already defined :: DATA:() ) / .csv.read[file]/read10[file] - for when you don't care about checking/tweaking the before reading \d .csv DELIM:"," ZAPHDRS:0b / lowercase and remove _ from colhdrs (junk characters are always removed) WIDTHHDR:25000 / number of characters read to get the header READLINES:222 / number of lines read and used to guess the types SYMMAXWIDTH:11 / character columns narrower than this are stored as symbols SYMMAXGR:10 / max symbol granularity% before we give up and keep as a * string FORCECHARWIDTH:30 / every field (of any type) with values this wide or more is forced to character "*" DISCARDEMPTY:0b / completely ignore empty columns if true else set them to "C" k)nameltrim:{$[~@x;.z.s'x;~(*x)in aA:.Q.a,.Q.A;(+/&\~x in aA)_x;x]} cleanhdrs:{{$[ZAPHDRS;lower x except"_";x]}x where x in DELIM,.Q.an} cancast:{nw:x$"";if[not x in"BXCS";nw:(min 0#;max 0#;::)@\:nw];$[not any nw in x$(11&count y)#y;$[11.csv.FORCECHARWIDTH; / long values info:update t:"C "[.csv.DISCARDEMPTY],rule:30,empty:0b from info where t="?",mw=0; / empty columns info:update dchar:{asc distinct raze x}peach sdv from info where t="?"; info:update mdot:{max sum each"."=x}peach sdv from info where t="?",{"."in x}each dchar; info:update t:"n",rule:40 from info where t="?",{$[any x in"0123456789";all x in".-+eE0123456789/: ";0b]}each dchar; / vaguely numeric.. info:update t:"I",rule:50,ipa:1b from info where t="n",mw within 7 15,mdot=3,{all x in".0123456789"}each dchar; / ip-address info:update t:"J",rule:60 from info where t="n",mdot=0,{all x in"+-0123456789"}each dchar,.csv.cancast["J"]peach sdv; info:update t:"I",rule:70 from info where t="J",mw<12,.csv.cancast["I"]peach sdv; info:update t:"H",rule:80 from info where t="I",mw<7,.csv.cancast["H"]peach sdv; info:update t:"F",rule:90,maybe:0b from info where t="n",mdot<2,mw>1,.csv.cancast["F"]peach sdv; info:update t:"E",rule:100,maybe:0b from info where t="F",mw<9,{all x in".+-0123456789"}each dchar; / M [yy]yymm yyyy[?]mm info:update t:"M",rule:110,maybe:1b from info where t="I",mw=6,.csv.cancast["M"]peach sdv; / 200506, YYYYMM is less likely than [H]HMMSS so do that first info:update t:"M",rule:120,maybe:1b from info where t="H",mw=4,.csv.cancast["M"]peach sdv,{not all(value each x)within 1960 2035}peach sdv; / 0506, YYMM is less likely than [H]HMM so do that first info:update t:"M",rule:130,maybe:0b from info where t in"?n",mw=7,{all x like"[12][0-9][0-9][0-9]?[01][0-9]"}peach sdv,.csv.cancast["M"]peach sdv; / 2005?06, YYYY?MM info:update t:"M",rule:140,maybe:1b from info where t in"EF",mw=7,{all x like"[12][0-9][0-9][0-9].[01][0-9]"}peach sdv,.csv.cancast["M"]peach sdv; / 2005.06, YYYY.MM info:update t:"V",rule:150,maybe:1b from info where t="I",mw in 5 6,71,gr<.csv.SYMMAXGR; / symbols (max width permitting) info:update t:"*",rule:310,maybe:0b from info where t="?"; / the rest as strings info:update maybe:1b from info where mw>4,not t="D",(lower c)like"*date*"; info:update maybe:1b from info where mw>1,not t in"TUV",(lower c)like"*time*"; / flag those S/* columns which could be encoded to integers (.Q.j10/x10/j12/x12) to avoid symbols info:update j10:0b,j12:0b from info; info:update j12:1b from info where t in"S*",mw<13,{all x in .Q.nA}each dchar; info:update j10:1b from info where t in"S*",mw<11,{all x in .Q.b6}each dchar; select c,ci,t,maybe,empty,res,j10,j12,ipa,mw,mdot,rule,gr,ndv,dchar from info} info:info0[;()] / by default don't restrict columns infolike:{[file;pattern] info0[file;{x where x like y}[lower colhdrs[file];pattern]]} / .csv.infolike[file;"*time"] \d . / DATA:() bulkload:{[file;info] if[not`DATA in system"v";'`DATA.not.defined]; if[count DATA;'`DATA.not.empty]; loadhdrs:exec c from info where not t=" ";loadfmts:exec t from info; .Q.fs[{[file;loadhdrs;loadfmts] `DATA insert $[count DATA;flip loadhdrs!(loadfmts;.csv.DELIM)0:file;loadhdrs xcol(loadfmts;enlist .csv.DELIM)0:file]}[file;loadhdrs;loadfmts]]; count DATA} @[.:;"\\l csvutil.custom.q";::]; / save your custom settings in csvutil.custom.q to override those set at the beginning of the file