## Copyright (C) 2013,2014 Markus Bergholz
## Parts Copyright (C) 2013,2014 Philip Nienhuis
##
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 3 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Octave; see the file COPYING. If not, see
## .
## -*- texinfo -*-
## @deftypefn {Function File} [ @var{raw}, @var{xls}, @var rstatus} ] = __OCT_xlsx2oct__ (@var{xlsx}, @var{wsh}, @var{range}, @spsh_opts)
## Internal function for reading data from an xlsx worksheet
##
## @seealso{}
## @end deftypefn
## Author: Markus Bergholz
## Created: 2013-10-04
## Updates:
## 2010-10-20 Transplanted & adapted section for text string worksheet names (PRN)
## '' Much code restyled into Octave coding conventions
## 2013-10-28 More fixes by Markus
## 2013-11-02 (PRN) Added rstatus return arg (needed by xlsread.m)
## 2013-11-04 (PRN) Adapted regexp search strings to include (numeric) formulas and booleans
## '' (PRN) Commented out code for only numeric data until contiguousness is checked
## 2013-11-08 (PRN) Fix reading date/time
## '' Rework code to keep cell array, not a numeric matrix
## '' Add reading formulas
## 2013-11-09 (PRN) Add reading strings & formulas
## '' Rearrange code
## '' Prepare for fast reading (still uncommented)
## '' Implement selected range (still rough ATM but for devs the easiest)
## 2013-11-10 (PRN) Fix typo preventing reading named worksheets instead of indices
## 2013-11-13 (PRN) Pretty text output
## 2013-11-15 (PRN) Catch empty sharedString.xml (means no fixed strings)
## 2013-11-16 (PRN) Replace fgetl calls by fread to cope with EOLs
## 2013-12-12 (PRN) Adapted regular expressions for formulas_as_text to cope with POI
## '' Adapted regular expression for shared strings to cope with empty strings
## 2013-12-14 (PRN) Adapt regexpr for values (include "n" as value type)
## '' Fix regexpr for strings ( \d? => \d+ )
## '' Add isfinite() check before attempt to process fixed strings
## 2013-12-19 (MB) Replace call to __col_str_to_number with __OCT_cc__
## 2014-03-18 (PRN) Fix regexp for reading strings from sharedStrings.xml
## 2014-04-26 Replace __OCT_cc__ by binary __char2num__
function [ raw, xls, rstatus ] = __OCT_xlsx2oct__ (xls, wsh, crange='', spsh_opts)
## spsh_opts is guaranteed to be filled by caller
## If a worksheet if given, check if it's given by a name (string) or a number
if (ischar (wsh))
# fid = fopen (sprintf ('%s/xl/workbook.xml', xls.workbook));
# if (fid < 0)
## File open error
# error ("xls2oct: file %s couldn't be opened for reading", filename);
# else
# xml = fread (fid, "char=>char").';;
## Close file
# fclose (fid);
## Search for requested sheet name
id = strmatch (wsh, xls.sheets.sh_names);
if (isempty (id))
error ("xls2oct: cannot find sheet '%s' in file %s", wsh, xls.filename);
else
wsh = xls.sheets.sheetid(id);
endif
# endif
elseif (wsh > numel (xls.sheets.sh_names))
error ("xls2oct: worksheet number %d > number of worksheets in file (%d)", wsh, numel (xls.sheets.sh_names));
elseif (wsh < 1)
warning ("xls2oct: illegal worksheet number (%d) - worksheet #1 assumed", wsh);
endif
## Prepare to open requested worksheet file in subdir xl/ . Note: Win accepts forward slashes
rawsheet = fopen (sprintf ('%s/xl/worksheets/sheet%d.xml', xls.workbook, wsh));
if (rawsheet > 0)
## Get data
rawdata = fread (rawsheet, "char=>char").';
fclose (rawsheet);
## Strings
try
fid = fopen (sprintf ("%s/xl/sharedStrings.xml", xls.workbook));
strings = fread (fid, "char=>char").';
fclose (fid);
catch
## No sharedStrings.xml; implies no "fixed" strings (computed strings can still be there)
strings = "";
end_try_catch
else
error ("Couldn't open worksheet xml file sheet%d.xml\n", wsh);
endif
rstatus = 0;
## Check if there are only numeric data (then no s="" or r="s")
## FIXME contiguous data rectangles check needed!! so corresponding else clause below
##if (! (numel (strfind (rawdata, ' s="')) > 0 || numel (strfind (rawdata, '')) > 0 || numel (strfind (rawdata, 't="s"')) > 0))
## if no s="...", nor , nor t="s" is found. good luck - it could be awesome fast (like example.xlsx)!
## FIXME This will break on the reshape below if the data are non-contiguous. Numeric formula results are skipped too
raw = {};
##val = cell2mat (regexp (rawdata, '(.*?)', "tokens"));
##valraw = [];
##FIXME This reshape will break on non-contguous data
##try
## matrows = size (strfind (rawdata, ''), 2)
## raw = reshape (val, [], matrows)';
## raw = num2cell (raw);
## ## Reshape didn't complain. Big chance we have contiguous data.
## topleft = cell2mat (regexp (rawdata, '.*?', "tokens", "once"));
## [~, ~, ~, trow, lcol] = parse_sp_range (topleft);
## [nrows, ncols] = size (raw);
## xls.limits = [lcol, lcol + ncols - 1; trow, trow + nrows - 1];
## rstatus = 1;
## return
##catch
## raw = {};
##end_try_catch
## 'val' are the actual values. 'valraw' are the corresponding(!) cell positions (e.g. B3).
if (isempty (raw))
## General note for tuning: '"([^"]*)"' (w/o single quotes) could be faster than '"(.*?)"'
## (http://stackoverflow.com/questions/2503413/regular-expression-to-stop-at-first-match comment #7)
## Below are loads of nested IFs. They're needed to catch empty previous results, even empty sheets
## 1. Get pure numbers, including booleans, double and boolean formula results, from cells w/o 's=""' tag
val = cell2mat (regexp (rawdata, '(?:|/>))?(.*?)', "tokens"));
if (! isempty (val))
valraw = cell2mat (regexp (rawdata, '(?:|/>))?.*?', "tokens"));
endif
## If val is still empty, try another regexpression (PRN: will this ever work? haven't seen such cells)
if (numel (val) == 0)
val = cell2mat (regexp (rawdata, '(.*?)', "tokens"));
if (! isempty (val))
valraw = cell2mat (regexp (rawdata, '.*?', "tokens"));
endif
endif
## If 'val' exist, check if there are tagged s="NUMBERS" values
if (numel (regexp (rawdata, ' s="', "once")) > 0)
## Time/date values. Exclude formulas (having of tags),
## strings ('t="s"') and error results ('t="e"')
valp = cell2mat (regexp (rawdata, '(.*?)', "tokens"));
if (! isempty (valp))
valrawp = cell2mat(regexp (rawdata, '.*?', "tokens"));
if (! isempty (val))
val = [val valp];
valraw = [valraw valrawp];
else
val = valp;
valraw = valrawp;
clear valp valrawp ;
endif
endif
endif
## Turn strings into numbers
if (! isempty (val))
val = num2cell (str2double (val));
endif
## 2. String / text formulas (cached results are in this sheet; fixed strings in )
## Formulas
if (spsh_opts.formulas_as_text)
## Get formulas themselves as text strings. Formulas have no 't="s"' attribute. Keep starting '>' for next line
valf1 = cell2mat (regexp (rawdata, '.*?)(?:.*?)?', "tokens"));
if (! isempty (valf1))
valf1 = regexprep (valf1, '^>', '=');
## Pretty text output
valf1 = strrep (valf1, """, '"');
valf1 = strrep (valf1, "<", "<");
valf1 = strrep (valf1, ">", ">");
valf1 = strrep (valf1, "&", "&");
valrawf1 = cell2mat(regexp (rawdata, '.*?(?:.*?)?', "tokens"));
if (isempty (val))
val = valf1;
else
## Formulas start with '=' so:
val = [val valf1];
valraw = [valraw valrawf1];
endif
endif
clear valf1 valrawf1 ;
else
## Get (cached) formula results. Watch out! as soon as a "t" attibute equals "b" or is missing it is a number
## First the non-numeric formula results
valf2 = cell2mat (regexp (rawdata, '(?:|/>))(.*?)', "tokens"));
if (! isempty (valf2))
## Pretty text output
valf2 = strrep (valf2, """, '"');
valf2 = strrep (valf2, "<", "<");
valf2 = strrep (valf2, ">", ">");
valf2 = strrep (valf2, "&", "&");
valrawf2 = cell2mat(regexp (rawdata, '(?:|/>)).*?', "tokens"));
if (isempty (val))
val = valf2;
valraw = valrawf2;
else
val = [val valf2];
valraw = [valraw valrawf2];
endif
endif
clear valf2 valrawf2 ;
## Next the numeric formula results. These need additional conversion
valf3 = cell2mat (regexp (rawdata, '(?:|/>))(.*?)', "tokens"));
if (! isempty (valf3))
valrawf3 = cell2mat(regexp (rawdata, '(?:|/>)).*?', "tokens"));
if (isempty (val))
val = num2cell(str2double (valf3));
valraw = valrawf3;
else
val = [val num2cell(str2double (valf3))];
valraw = [valraw valrawf3];
endif
endif
clear valf3 valrawf3 ;
endif
## 3. Strings
if (! isempty (strings))
## Extract string values. May be much more than present in current sheet
strings=regexp(strings,']*>.*?','match');
for n = 1:columns(strings)
ctext{1,n}=cell2mat(cell2mat(regexp(strings{1,n},']*>(.*?)','tokens')));
end
## Pointers into sharedStrings.xml. "Hard" (fixed) strings have 't="s"' attribute
## For reasons known only to M$ those pointers are zero-based, so:
vals = str2double (cell2mat (regexp (rawdata, '(\d+)', "tokens"))) + 1;
if (! isempty (vals) && isfinite (vals))
## Get actual values
vals = ctext(vals);
## Pretty text output
vals = strrep (vals, """, '"');
vals = strrep (vals, "<", "<");
vals = strrep (vals, ">", ">");
vals = strrep (vals, "&", "&");
## Cell addresses
valraws = cell2mat (regexp (rawdata, '\d+', "tokens"));
if (isempty (val))
val = vals;
valraw = valraws;
else
val = [val vals];
valraw = [valraw valraws];
endif
endif
clear vals valraws ;
endif
## If val is empty, sheet is empty
if (isempty (val))
xls.limits = [];
raw = {};
return
endif
## 4. Prepare
## Get the row number (currently supported from 1 to 999999)
vi.row = str2double (cell2mat (regexp (valraw, '(\d+|\d+\d+|\d+\d+\d+|\d+\d+\d+\d+|\d+\d+\d+\d+\+d|\d+\d+\d+\d+\d+\d+)?', "match"))')';
## Get the column character (A to ZZZ) (that are more than 18k supported columns atm)
vi.alph = cell2mat (regexp (valraw, '([A-Za-z]+|[A-Za-z]+[A-Za-z]+|[A-Za-z]+[A-Za-z]+[A-Za-z]+)?', "match"));
## Free memory; might be useful while reading huge files
clear valraw ;
## If missed formular indices
idx.all = cell2mat (regexp (rawdata, ']*> 1; C -> 3, AB -> 28 ...
vi.col = double (cell2mat (cellfun (@__char2num__, vi.alph, "UniformOutput", 0)));
## Find data rectangle limits
idx.mincol = min ([idx.alph vi.col]);
idx.minrow = min ([idx.num vi.row]);
idx.maxrow = max ([idx.num vi.row]);
idx.maxcol = max ([idx.alph vi.col]);
## Convey limits of data rectangle to xls2oct. Must be done here
xls.limits = [idx.mincol, idx.maxcol; idx.minrow, idx.maxrow];
## column adjustment when first number or formula don't begin in the first column
if (1 < idx.mincol)
vi.col = vi.col - (idx.mincol - 1);
endif
## row adjustment when first number or formular don't begin in the first row
if (1 < idx.minrow)
vi.row = vi.row - (idx.minrow - 1);
endif
## Initialize output cell array
raw = cell (idx.maxrow - idx.minrow + 1, idx.maxcol - idx.mincol + 1);
## get logical indices for 'val' from 'valraw' positions in NaN matrix
vi.idx = sub2ind (size (raw), (vi.row), (vi.col));
## set values to the corresponding indizes in final cell matrix
raw(vi.idx) = val;
endif
## FIXME maybe reading parts of the data can be done faster above by better regexps
if (! isempty (crange))
## We'll do it the easy way: just read all data, then return only the requested part
[~, nr, nc, tr, lc] = parse_sp_range (crange);
xls.limits = [max(idx.mincol, lc), min(idx.maxcol, lc+nc-1); max(idx.minrow, tr), min(idx.maxrow, tr+nr-1)];
## Correct spreadsheet locations for lower right shift or raw
rc = idx.minrow - 1;
cc = idx.mincol - 1;
raw = raw(xls.limits(2, 1)-rc : xls.limits(2, 2)-rc, xls.limits(1, 1)-cc : xls.limits(1, 2)-cc);
endif
if (! isempty (val))
rstatus = 1;
endif
endfunction