]> scripts.mit.edu Git - autoinstallsdev/mediawiki.git/blob - maintenance/backupPrefetch.inc
MediaWiki 1.11.0
[autoinstallsdev/mediawiki.git] / maintenance / backupPrefetch.inc
1 <?php
2
// PHP 5.1 replaced XMLReader's global XMLREADER_* constants with class
// constants (XMLReader::*). The code below still refers to the global
// names, so recreate every global that is missing from its class-constant
// counterpart.
$xmlReaderConstants = array(
	'NONE',
	'ELEMENT',
	'ATTRIBUTE',
	'TEXT',
	'CDATA',
	'ENTITY_REF',
	'ENTITY',
	'PI',
	'COMMENT',
	'DOC',
	'DOC_TYPE',
	'DOC_FRAGMENT',
	'NOTATION',
	'WHITESPACE',
	'SIGNIFICANT_WHITESPACE',
	'END_ELEMENT',
	'END_ENTITY',
	'XML_DECLARATION',
	'LOADDTD',
	'DEFAULTATTRS',
	'VALIDATE',
	'SUBST_ENTITIES',
);
foreach( $xmlReaderConstants as $name ) {
	$fullName = 'XMLREADER_' . $name;
	$newName = 'XMLReader::' . $name;
	if( defined( $fullName ) || !defined( $newName ) ) {
		// Either the global constant already exists, or the class
		// constant is unavailable too (broken or missing extension):
		// nothing to do for this name.
		continue;
	}
	define( $fullName, constant( $newName ) );
}
40
/**
 * Readahead helper for making large MediaWiki data dumps;
 * reads in a previous XML dump to sequentially prefetch text
 * records already normalized and decompressed.
 *
 * This can save load on the external database servers, hopefully.
 *
 * Assumes that dumps will be recorded in the canonical order:
 * - ascending by page_id
 * - ascending by rev_id within each page
 * - text contents are immutable and should not change once
 *   recorded, so the previous dump is a reliable source
 *
 * The reader only ever moves forward, so lookups must be requested in
 * that same order; anything that was already passed is reported as
 * unavailable (null).
 *
 * Requires PHP 5 and the XMLReader extension, plus the global
 * XMLREADER_* node type constants.
 */
class BaseDump {
	/** XMLReader instance streaming the previous dump */
	public $reader = null;
	/** True once the input is exhausted, unreadable or closed */
	public $atEnd = false;
	/** True once the closing tag of the current page has been passed */
	public $atPageEnd = false;
	/** ID of the page the reader is currently inside (0 before the first page) */
	public $lastPage = 0;
	/** ID of the revision most recently read on the current page (0 if none) */
	public $lastRev = 0;
	/** True while the text of revision $lastRev has not been consumed yet */
	public $textPending = false;

	/**
	 * Opens the previous dump for streaming.
	 *
	 * If the file cannot be opened the object is marked as exhausted, so
	 * every prefetch() call simply reports a miss.
	 *
	 * @param string $infile path or URI of the XML dump to read
	 */
	public function __construct( $infile ) {
		$this->reader = new XMLReader();
		if( !$this->reader->open( $infile ) ) {
			$this->debug( "BaseDump::__construct was unable to open xml input file" );
			$this->atEnd = true;
		}
	}

	/**
	 * PHP 4 style constructor name. It is no longer what constructs the
	 * object (PHP 8 does not treat it as a constructor at all); it is kept
	 * only so explicit calls such as parent::BaseDump() keep working.
	 *
	 * @param string $infile path or URI of the XML dump to read
	 */
	public function BaseDump( $infile ) {
		$this->__construct( $infile );
	}

	/**
	 * Attempts to fetch the text of a particular page revision
	 * from the dump stream. May return null if the page is
	 * unavailable.
	 *
	 * Each revision's text can be fetched at most once; asking again for
	 * a revision whose text was already consumed returns null.
	 *
	 * @param int $page ID number of page to read
	 * @param int $rev ID number of revision to read
	 * @return string or null
	 */
	public function prefetch( $page, $rev ) {
		$page = intval( $page );
		$rev = intval( $rev );
		while( $this->lastPage < $page && !$this->atEnd ) {
			$this->debug( "BaseDump::prefetch at page $this->lastPage, looking for $page" );
			$this->nextPage();
		}
		if( $this->lastPage > $page || $this->atEnd ) {
			$this->debug( "BaseDump::prefetch already past page $page looking for rev $rev  [$this->lastPage, $this->lastRev]" );
			return null;
		}
		while( $this->lastRev < $rev && !$this->atEnd && !$this->atPageEnd ) {
			$this->debug( "BaseDump::prefetch at page $this->lastPage, rev $this->lastRev, looking for $page, $rev" );
			$this->nextRev();
		}
		// Only a revision whose <id> was just read and whose text is still
		// ahead of the reader counts as a hit. Without the textPending check
		// a repeated request (or a request for rev 0 on a fresh page) would
		// be answered with the text of a *following* revision.
		if( $this->lastRev === $rev && !$this->atEnd && $this->textPending ) {
			$this->debug( "BaseDump::prefetch hit on $page, $rev [$this->lastPage, $this->lastRev]" );
			return $this->nextText();
		}
		$this->debug( "BaseDump::prefetch already past rev $rev on page $page  [$this->lastPage, $this->lastRev]" );
		return null;
	}

	/**
	 * Sends a line to the debug log via wfDebug().
	 *
	 * @param string $str message, without trailing newline
	 */
	public function debug( $str ) {
		wfDebug( $str . "\n" );
		//global $dumper;
		//$dumper->progress( $str );
	}

	/**
	 * Advances to the next <page> and records its ID. When no further
	 * page exists the reader is closed and the dump marked as exhausted.
	 *
	 * @access private
	 */
	public function nextPage() {
		if( $this->skipTo( 'page', 'mediawiki' ) ) {
			if( $this->skipTo( 'id' ) ) {
				$this->lastPage = intval( $this->nodeContents() );
				$this->lastRev = 0;
				$this->atPageEnd = false;
				$this->textPending = false;
			}
		} elseif( !$this->atEnd ) {
			// Reached </mediawiki>: nothing more to read, release the reader.
			$this->close();
		}
	}

	/**
	 * Advances to the next <revision> of the current page and records its
	 * ID. Sets atPageEnd when the end of the page is reached instead.
	 *
	 * @access private
	 */
	public function nextRev() {
		$this->textPending = false;
		if( !$this->skipTo( 'revision' ) ) {
			$this->atPageEnd = true;
			return;
		}
		if( $this->skipTo( 'id' ) ) {
			$this->lastRev = intval( $this->nodeContents() );
			// A truncated <id> closes the reader; then there is no text to wait for.
			$this->textPending = !$this->atEnd;
		} else {
			// The search for <id> ran into </page> (or the end of input),
			// so this page has nothing further to offer.
			$this->atPageEnd = true;
		}
	}

	/**
	 * Reads the <text> of the revision the reader is currently inside.
	 *
	 * @return string or null; null if the revision has no <text> element
	 *   or the input ended before the element was complete
	 * @access private
	 */
	public function nextText() {
		$this->textPending = false;
		// Stop at </revision>: searching up to </page> would hand back the
		// text of a later revision when this one has none.
		if( !$this->skipTo( 'text', 'revision' ) ) {
			return null;
		}
		$text = $this->nodeContents();
		if( $text === null ) {
			// Input ended mid-element; a truncated text is not a usable hit.
			return null;
		}
		return strval( $text );
	}

	/**
	 * Reads forward until an opening <$name> element is found.
	 *
	 * @param string $name element name to look for
	 * @param string $parent element whose closing tag aborts the search
	 * @return bool true if positioned on <$name>; false if </$parent> or
	 *   the end of input was reached first (the latter closes the reader)
	 * @access private
	 */
	public function skipTo( $name, $parent='page' ) {
		if( $this->atEnd ) {
			return false;
		}
		while( $this->reader->read() ) {
			$type = $this->reader->nodeType;
			$current = $this->reader->name;
			if( $type === XMLREADER_ELEMENT && $current === $name ) {
				return true;
			}
			if( $type === XMLREADER_END_ELEMENT && $current === $parent ) {
				$this->debug( "BaseDump::skipTo found </$parent> searching for <$name>" );
				return false;
			}
		}
		$this->close();
		return false;
	}

	/**
	 * Shouldn't something like this be built-in to XMLReader?
	 * Fetches text contents of the current element, assuming
	 * no sub-elements or such scary things.
	 *
	 * @return string or null; null if the input is exhausted or ends
	 *   before the element's closing tag
	 * @access private
	 */
	public function nodeContents() {
		if( $this->atEnd ) {
			return null;
		}
		if( $this->reader->isEmptyElement ) {
			return "";
		}
		$buffer = "";
		while( $this->reader->read() ) {
			switch( $this->reader->nodeType ) {
			case XMLREADER_TEXT:
			// CDATA sections carry character data just like text nodes;
			// skipping them would silently drop content.
			case XMLREADER_CDATA:
//			case XMLREADER_WHITESPACE:
			case XMLREADER_SIGNIFICANT_WHITESPACE:
				$buffer .= $this->reader->value;
				break;
			case XMLREADER_END_ELEMENT:
				return $buffer;
			}
		}
		return $this->close();
	}

	/**
	 * Closes the reader and marks the dump as exhausted.
	 *
	 * @return null always, so callers can "return $this->close();"
	 * @access private
	 */
	public function close() {
		$this->reader->close();
		$this->atEnd = true;
		return null;
	}
}
202
203 ?>