Difference between revisions of "SAS macro %progress"

From PHUSE Wiki
Jump to: navigation, search
(Author)
(Creating a dummy page dataset)
(2 intermediate revisions by the same user not shown)
Line 4: Line 4:
  
 
== Motivation ==
 
== Motivation ==
Information will be provided after the presentation.
+
Developing a process to read back into SAS dozens of Summary Tables extracted as .txt files, and identify the various blocks of text present on each page - titles, headers (distinguishing between single column headers and multi-column spanning headers or "supra-headers"), table body, footnotes.<br />
 +
The identification of text lines corresponding to column headers can be done according to the number and maximum length of “blank fields” ( ≥2 spaces) found between non-space characters.<br />
 +
CALL PRXNEXT is used to search a line of text for matches against a regular expression (RegEx) pattern '/(?<!\s)\s{2,}(?!\s)/' that could be described as: 2 or more spaces, not immediately following a space, nor immediately followed by a space.  It loops over all matches, until no further match is found, counting the number of matches and retaining their maximum length.<br />
 +
To avoid repetitive and somewhat bulky code in a data step, the CALL PRXNEXT loops are encapsulated into 2 PROC FCMP custom functions.<br />
 +
The program has to process a growing number of pages.
  
 
== Minimum reproducible Example ==
 
== Minimum reproducible Example ==
  
 
=== Creating a dummy page dataset ===
 
=== Creating a dummy page dataset ===
The code below is shared under the MIT license: [http://opensource.org/licenses/MIT http://opensource.org/licenses/MIT]
+
The code on this page is shared under the MIT license: [http://opensource.org/licenses/MIT http://opensource.org/licenses/MIT]
 
<pre>
 
<pre>
 
*- Create a dataset with one dummy table page for demonstration, indexed by line number (l) -*;
 
*- Create a dataset with one dummy table page for demonstration, indexed by line number (l) -*;
Line 79: Line 83:
 
   stop;
 
   stop;
 
run;
 
run;
 +
</pre>
 +
 +
 +
== PROC FCMP functions ==
 +
<pre>
 +
 +
proc fcmp outlib=work.functions.prx;
 +
 +
  *- Function to return the number of times a PRX pattern has matches in a given string-*;
 +
 +
  function PRXNMATCH(pattern $, text $) ;
 +
      prx=prxparse(pattern);
 +
      start = 1;
 +
      stop = lengthn(text);
 +
      matchnum = 0;
 +
      pos = 0;
 +
      len = 0;
 +
      if (stop > 0) then do until(pos = 0);
 +
        ini=start;
 +
        call prxnext(prx, start, stop, text, pos, len);
 +
        *- CALL PRXNEXT searches a string 'text' for a pattern match (PERL regular Expression parsed as 'prx')
 +
            multiple times in succession, between the 'start' and the 'stop' positions (initially set to first
 +
            and last positions in the string).  When a match is found, the starting position 'pos' and length 'len'
 +
            of the matched substring are updated, and the 'start' for the next search is positionned at the first
 +
            character after the end of matched substring.  When no match is found, the position 'pos' and length 'len'
 +
            are set to 0, and the value of 'start' is left unchanged. -*;
 +
        if (len>0) then matchnum+1; *- count the number of matches -*;
 +
      end; 
 +
      return(matchnum); *- return the number of matches -*;
 +
  endsub;
 +
 +
      *- Function to return the maximum length of all matches a PRX pattern has in a given string-*;
 +
  function PRXMAXMATCHLEN(pattern $, text $) ;
 +
      prx = prxparse(pattern);
 +
      start = 1;
 +
      stop = lengthn(text);
 +
      pos = 0;
 +
      len = 0;
 +
      maxlen = 0;
 +
      if (stop > 0) then do until(pos = 0);
 +
        ini = start;
 +
        call prxnext(prx, start, stop, text, pos, len);
 +
        if (len>maxlen) then maxlen = len;  *- store the max length of all matches found in the current string -*;
 +
      end; 
 +
      return(maxlen); *- return the maximum length of all matches -*;
 +
  endsub;
 +
 +
run;
 +
quit;
 +
 +
option cmplib = work.functions;
 +
 +
 +
</pre>
 +
 +
== Processing various number of pages ==
 +
 +
<pre>
 +
 +
option nofullstimer nomprint;
 +
option mcompilenote = all;
 +
 +
*******************************************************************************************************************;
 +
 +
*- Define a macro to process a certain number of pages,
 +
  and report the time and number of pages processed per second when a condition is met
 +
  (by default: when the last page has been processed) -*;
 +
 +
%macro process(pages = 1, report_when = last);
 +
  data pages&pages(drop = starttime elapsed pps status)
 +
        status&pages(keep = p elapsed pps status);
 +
      retain starttime;
 +
      length status $100;
 +
      if _n_ = 1 then starttime = time();
 +
 +
      set page10000 (where = ( p <= &pages )) end = last;
 +
      by p l;
 +
      *- To help separate titles, table header, body and footnotes lines,
 +
        we will identify the number and largest size of embedded blank fields
 +
        (at least 2 consecutive spaces not preceded nor followed by any other space)
 +
        per line (excluding leading and trailing blanks) -*;
 +
      blankFieldsNum  =      PRXNMATCH('/(?<!\s)\s{2,}(?!\s)/', cats(line));  *- number of 'blank' fields of at least 2 consecutive spaces in current LINE -*;
 +
      maxBlFieldWidth = PRXMAXMATCHLEN('/(?<!\s)\s{2,}(?!\s)/', cats(line));  *- max width of all 'blank' fields of at least 2 consecutive spaces in current LINE -*;
 +
      output pages&pages;
 +
 +
      if last.p then do; *- calculate status and save it at the end of every page -*;
 +
        elapsed = time() - starttime;
 +
        if elapsed > 0 then pps = p / elapsed;
 +
        status = "Processed " || put(p, comma6.) || " pages in " || put(elapsed, time12.3) || ", i.e. " || put(pps, 8.2) || " pages per second";
 +
        output status&pages;
 +
        if &report_when
 +
            then put status;
 +
      end;
 +
      format starttime elapsed time12.3 pps 8.2;
 +
  run;
 +
%mend process;
 +
 +
option fullstimer;
 +
%process(pages =    10);
 +
%process(pages =  100);
 +
%process(pages =  1000);
 +
 +
*- report status every 100 pages until 1000, then every 500 pages -*;
 +
*
 +
%process(pages = 10000
 +
        ,report_when = (p <= 1000 and mod(p, 100) = 0) or (p > 1000 and mod(p, 500) = 0) );
 +
*- not run -*;
 
</pre>
 
</pre>
  
 
== Processing Problem ==
 
== Processing Problem ==
 +
Run times increase unexpectedly
 +
{| class="wikitable"
 +
|-
 +
| Initial run || Intermediate run || Final run
 +
|-
 +
| 150 pages || 1000 pages || 8000 pages
 +
|-
 +
| < 1 sec || < 1 min || >1h, still running!
 +
|}
 +
 +
== Solution ==
 +
=== Real-time Monitoring ===
 +
Information will be provided after the presentation.
 +
 +
=== Real-time Monitoring + Stop-and-Restart Loop ===
 
Information will be provided after the presentation.
 
Information will be provided after the presentation.
  
== Solution ==
+
== Performance ==
 
Information will be provided after the presentation.
 
Information will be provided after the presentation.
  

Revision as of 16:45, 8 November 2019

Macro %progress - Building Real-Time Feedback in the Data Step

This macro will be presented at the PhUSE EU Connect 2019 conference (Amsterdam).

Motivation

The motivating task was to develop a process that reads dozens of Summary Tables, extracted as .txt files, back into SAS and identifies the various blocks of text present on each page: titles, headers (distinguishing single-column headers from multi-column spanning headers, or "supra-headers"), table body and footnotes.
The identification of text lines corresponding to column headers can be done according to the number and maximum length of “blank fields” (≥ 2 consecutive spaces) found between non-space characters.
CALL PRXNEXT is used to search a line of text for matches against a regular expression (RegEx) pattern '/(?<!\s)\s{2,}(?!\s)/' that could be described as: 2 or more spaces, not immediately following a space, nor immediately followed by a space. It loops over all matches, until no further match is found, counting the number of matches and retaining their maximum length.
To avoid repetitive and somewhat bulky code in a data step, the CALL PRXNEXT loops are encapsulated into 2 PROC FCMP custom functions.
The program has to process a growing number of pages.

Minimum reproducible Example

Creating a dummy page dataset

The code on this page is shared under the MIT license: http://opensource.org/licenses/MIT

*- Create a dataset with one dummy table page for demonstration, indexed by line number (l) -*;
data page(index = (l));
      infile cards truncover;
      input @1 line $char200.;
      l = _n_;
      output;
      put l @8 line $char.;
   cards;
   SPONSOR                                                                                                       CONFIDENTIAL
   INVESTIGATIONAL NEW DRUG / Indication                                                                                Final
   ST0012
                                                           Table 2.1.1    
                                 Demographics and Baseline Characteristics by Period 2 Treatment Group
                                                    Analysis Set: Safety Set 

                                                                      Period 2 Treatment Group
                                             ___________________________________________________________________
                                                          IND 2.5mg Q4H/   IND 4.5mg    IND 2.5mg    IND 2.5mg
                                                          IND 2.5mg Q4H +     Q4H/         Q4H/         Q4H/
                                                          IND 2.5mg Q4H/   IND 4.5mg    IND 2.5mg    IND 4.5mg
                                               PBO/PBO     IND 4.5mg Q8H      Q4H          Q4H          Q8H      All Subjects
    Variable                     Statistic       N=40          N=40           N=49         N=20         N=20         N=169

    Weight (kg)
      Min - 33.3pctile           n (%)        10 (25.0)   16  (40.0)      16  (32.7)    6  (30.0)    6  (30.0)   54  (32.0)
      (39.7 - 64.8)
      >33.3pctile - 66.6pctile   n (%)        30 (75.0)   12  (30.0)      14  (28.6)    6  (30.0)   10  (50.0)   72  (42.6)
      (>64.8 - 74.1)
      >66.6pctile - max          n (%)         0          12  (30.0)      19  (38.8)    8  (40.0)    4  (20.0)   43  (25.4)
      (>74.1 - 123.0)

    Height (cm)                  n               40           40              49           20           20          169
                                 Mean           163.13       167.31          165.74       166.78       167.84       166.30
                                 SD               7.15         6.65            7.40         6.63         6.79         7.07
                                 Median         162.50       167.30          165.50       166.25       168.60       166.30
                                 Min            157.0        155.0           149.2        155.0        155.4        149.2
                                 Max            170.5        186.0           180.6        181.5        186.0        186.0

   _______________________________________________________________________________________________________________________
   BMI=Body Mass Index, BSA=Body Surface Area, IND=Investigational New Drug, IA=Interim Analysis, F=Female, M=Male, 
   Max=Maximum, Min=Minimum, PBO=Placebo, Q4H=Every 4 Hours, Q8H=Every 8 Hours, SD=Standard Deviation, pctile=Percentile.
   Note: Only data assessed during blinded treatment is presented in this table.
   References: Listing 2.1.1, Listing 2.3.1

   Program: t_dm211, 2019-09-01 at 14:24                                                                          Page x of y
;
run;

Expanding to 10,000 pages

*- duplicate the page data 10,000 times by re-reading the page dataset repeatedly -*;
data page10000(drop = start elapsed);
   label p = 'page number' l = "line number";
   put nobs=;
   start = time();
   do p = 1 to 10000; *- loop over the desired number of pages -*;
      elapsed = time()-start;
      if mod(p, 1000) = 0 then put p= elapsed= time12.3;
      do l = 1 to nobs; *- loop over each line of the page -*; 
         set page key = l nobs = nobs; *- the key option allows reading the matching observation using the dataset index -*;
         if l = nobs then substr(line, 105) = put(catx(' ', 'Page', p, 'of', 10000), $20.-r); *- replace "Page x of y" by the actual numbers -*;
         output;
      end;
   end;
   stop;
run;


PROC FCMP functions


proc fcmp outlib=work.functions.prx; 

   *- Function to return the number of times a PRX pattern has matches in a given string-*;

   function PRXNMATCH(pattern $, text $) ;
      prx=prxparse(pattern);
      start = 1;
      stop = lengthn(text);
      matchnum = 0;
      pos = 0;
      len = 0;
      if (stop > 0) then do until(pos = 0); 
         ini=start;
         call prxnext(prx, start, stop, text, pos, len); 
         *- CALL PRXNEXT searches a string 'text' for a pattern match (PERL regular Expression parsed as 'prx')
            multiple times in succession, between the 'start' and the 'stop' positions (initially set to first 
            and last positions in the string).  When a match is found, the starting position 'pos' and length 'len'
            of the matched substring are updated, and the 'start' for the next search is positionned at the first
            character after the end of matched substring.  When no match is found, the position 'pos' and length 'len'
            are set to 0, and the value of 'start' is left unchanged. -*;
         if (len>0) then matchnum+1; *- count the number of matches -*;
      end;  
      return(matchnum); *- return the number of matches -*;
   endsub;

      *- Function to return the maximum length of all matches a PRX pattern has in a given string-*;
   function PRXMAXMATCHLEN(pattern $, text $) ;
      prx = prxparse(pattern);
      start = 1;
      stop = lengthn(text);
      pos = 0;
      len = 0;
      maxlen = 0;
      if (stop > 0) then do until(pos = 0); 
         ini = start;
         call prxnext(prx, start, stop, text, pos, len);
         if (len>maxlen) then maxlen = len;  *- store the max length of all matches found in the current string -*;
      end;  
      return(maxlen); *- return the maximum length of all matches -*;
   endsub;

run;
quit;

option cmplib = work.functions;


Processing various number of pages


option nofullstimer nomprint;
option mcompilenote = all;

*******************************************************************************************************************;

*- Define a macro to process a certain number of pages, 
   and report the time and number of pages processed per second when a condition is met
   (by default: when the last page has been processed) -*;

%macro process(pages = 1, report_when = last);
   data pages&pages(drop = starttime elapsed pps status)
        status&pages(keep = p elapsed pps status);
      retain starttime;
      length status $100;
      if _n_ = 1 then starttime = time();

      set page10000 (where = ( p <= &pages )) end = last;
      by p l;
      *- To help separate titles, table header, body and footnotes lines,
         we will identify the number and largest size of embedded blank fields 
        (at least 2 consecutive spaces not preceded nor followed by any other space) 
         per line (excluding leading and trailing blanks) -*;
      blankFieldsNum  =      PRXNMATCH('/(?<!\s)\s{2,}(?!\s)/', cats(line));  *- number of 'blank' fields of at least 2 consecutive spaces in current LINE -*;
      maxBlFieldWidth = PRXMAXMATCHLEN('/(?<!\s)\s{2,}(?!\s)/', cats(line));  *- max width of all 'blank' fields of at least 2 consecutive spaces in current LINE -*;
      output pages&pages;

      if last.p then do; *- calculate status and save it at the end of every page -*;
         elapsed = time() - starttime;
         if elapsed > 0 then pps = p / elapsed;
         status = "Processed " || put(p, comma6.) || " pages in " || put(elapsed, time12.3) || ", i.e. " || put(pps, 8.2) || " pages per second";
         output status&pages;
         if &report_when 
            then put status;
      end;
      format starttime elapsed time12.3 pps 8.2;
   run;
%mend process;

option fullstimer;
%process(pages =    10);
%process(pages =   100);
%process(pages =  1000);

*- report status every 100 pages until 1000, then every 500 pages -*;
*
%process(pages = 10000
        ,report_when = (p <= 1000 and mod(p, 100) = 0) or (p > 1000 and mod(p, 500) = 0) );
 *- not run -*;

Processing Problem

Run times increase unexpectedly

Initial run Intermediate run Final run
150 pages 1000 pages 8000 pages
< 1 sec < 1 min >1h, still running!

Solution

Real-time Monitoring

Information will be provided after the presentation.

Real-time Monitoring + Stop-and-Restart Loop

Information will be provided after the presentation.

Performance

Information will be provided after the presentation.

Root Cause

Information will be provided after the presentation.

Optimization

Information will be provided after the presentation.

Conclusions

Information will be provided after the presentation.

Author

Jean-Michel Bodart
Business & Decision Life Sciences
Rue Saint-Lambert 141
1200 Brussels
Belgium
http://www.businessdecision-lifesciences.com
--Jmbodart (talk) 10:11, 20 September 2019 (EDT)