Changeset - d6fbd191da57
[Not reviewed]
default
0 1 0
Nathan Brink (binki) - 13 years ago 2012-04-26 01:33:07
ohnobinki@ohnopublishing.net
Remove unnecessary CONSTITUENCY GET parameter from the Calvin College crawler.
1 file changed with 6 insertions and 6 deletions:
0 comments (0 inline, 0 general)
school.d/calvin.crawl.inc
Show inline comments
 
@@ -40,28 +40,28 @@ function calvin_crawl_semester_list(arra
 
		      /* I don't know if SU is a valid Calvin Semester ID or not */
 
		      'SU' => Semester::SEASON_SUMMER);
 

	
 
  /**
 
   * The first link we start at is the one from KV into WebAdvisor.
 
   *
 
   * 1. https://kvdata.calvin.edu/walive/WebAdvisor?CONSTITUENCY=WBST&type=P&pid=ST-WESTS12A&LASTTOKEN=NULL
 
   * 1. https://kvdata.calvin.edu/walive/WebAdvisor?type=P&pid=ST-WESTS12A&LASTTOKEN=NULL
 
   *    <body onload="javascript:getWindowHTML();">
 
   *
 
   *    Calls javascript:getWindowHTML(). This merely adds
 
   *    TOKENIDX=NULL to the query string, so we can skip this step
 
   *    and just have TOKENIDX=NULL.
 
   *
 
   * 2. https://kvdata.calvin.edu/walive/WebAdvisor?CONSTITUENCY=WBST&type=P&pid=ST-WESTS12A&LASTTOKEN=NULL&TOKENIDX=NULL
 
   * 2. https://kvdata.calvin.edu/walive/WebAdvisor?type=P&pid=ST-WESTS12A&LASTTOKEN=NULL&TOKENIDX=NULL
 
   *    <body onload="javascript:setWindowHTML('', '7699844013');">
 
   *
 
   *    In the above, the second argument to setWindowHTML() is
 
   *    random. Thus, we have to capture this value.
 
   */
 

	
 
  $cookies = array();
 
  $uri = 'https://kvdata.calvin.edu/walive/WebAdvisor?CONSTITUENCY=WBST&type=P&pid=ST-WESTS12A&LASTTOKEN=NULL';
 
  $uri = 'https://kvdata.calvin.edu/walive/WebAdvisor?type=P&pid=ST-WESTS12A&LASTTOKEN=NULL';
 
  $semesters_html = calvin_crawl_geturi($uri, $cookies, $school_crawl_log);
 

	
 
  $semesters_dom = new DOMDocument();
 
  $semesters_dom->loadHTML($semesters_html);
 

	
 
  /*
 
@@ -112,13 +112,13 @@ function calvin_crawl_semester_list(arra
 
 * \param $school_crawl_log
 
 *   The logger handle.
 
 */
 
function calvin_crawl_semester(array $school, Semester $semester, &$school_crawl_log)
 
{
 
  $cookies = array();
 
  $uri = 'https://kvdata.calvin.edu/walive/WebAdvisor?CONSTITUENCY=WBST&type=P&pid=ST-WESTS12A&LASTTOKEN=NULL';
 
  $uri = 'https://kvdata.calvin.edu/walive/WebAdvisor?type=P&pid=ST-WESTS12A&LASTTOKEN=NULL';
 
  $html = calvin_crawl_geturi($uri, $cookies, $school_crawl_log);
 
  $seed_dom = new DOMDocument();
 
  $seed_dom->loadHTML($html);
 
  $return_url = dom_input_value($seed_dom, 'RETURN.URL');
 

	
 
  /*
 
@@ -556,15 +556,15 @@ function calvin_crawl_geturi(&$uri, arra
 
   * 'CLONE' and 'FORCEIDX'. Then it appends TOKENIDX=<token> to the
 
   * query parameters.
 
   *
 
   * Example, where TOKENIDX does not start out as NULL but where a
 
   * CLONE=Y command is being sent:
 
   *
 
   * Input: HTTPS://kvdata.calvin.edu/walive/WebAdvisor?TYPE=P&PID=ST-WESTS13C&CLONE=Y&CLONE_PROCESS=Y&SPAUKQ=708501792841963&CONSTITUENCY=WBST&TOKENIDX=1507971558
 
   * Input: HTTPS://kvdata.calvin.edu/walive/WebAdvisor?TYPE=P&PID=ST-WESTS13C&CLONE=Y&CLONE_PROCESS=Y&SPAUKQ=708501792841963&TOKENIDX=1507971558
 
   *
 
   * Result: HTTPS://kvdata.calvin.edu/walive/WebAdvisor?TYPE=P&PID=ST-WESTS13C&CLONE_PROCESS=Y&SPAUKQ=708501792841963&CONSTITUENCY=WBST&TOKENIDX=2281086932
 
   * Result: HTTPS://kvdata.calvin.edu/walive/WebAdvisor?TYPE=P&PID=ST-WESTS13C&CLONE_PROCESS=Y&SPAUKQ=708501792841963&TOKENIDX=2281086932
 
   */
 
  $uri = preg_replace('/([?&])TOKENIDX=[^&]+/', '$1TOKENIDX=' . $token,
 
		      preg_replace('/([?&])(CLONE|FORCEIDX)=[^&]+&?/', '$1', $uri));
 

	
 
  return calvin_crawl_noscript_filter(school_crawl_geturi($uri, $cookies, $school_crawl_log));
 
}
0 comments (0 inline, 0 general)