]> scripts.mit.edu Git - autoinstalls/mediawiki.git/blob - maintenance/backupPrefetch.inc
MediaWiki 1.30.2-scripts
[autoinstalls/mediawiki.git] / maintenance / backupPrefetch.inc
1 <?php
2 /**
3  * Helper class for the --prefetch option of dumpTextPass.php
4  *
5  * Copyright © 2005 Brion Vibber <brion@pobox.com>
6  * https://www.mediawiki.org/
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation; either version 2 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License along
19  * with this program; if not, write to the Free Software Foundation, Inc.,
20  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21  * http://www.gnu.org/copyleft/gpl.html
22  *
23  * @file
24  * @ingroup Maintenance
25  */
26
27 /**
28  * Readahead helper for making large MediaWiki data dumps;
29  * reads in a previous XML dump to sequentially prefetch text
30  * records already normalized and decompressed.
31  *
32  * This can save load on the external database servers, hopefully.
33  *
34  * Assumes that dumps will be recorded in the canonical order:
35  * - ascending by page_id
36  * - ascending by rev_id within each page
37  * - text contents are immutable and should not change once
38  *   recorded, so the previous dump is a reliable source
39  *
40  * @ingroup Maintenance
41  */
class BaseDump {
	/** @var XMLReader Streaming reader over the dump file currently being consumed */
	protected $reader = null;
	/** @var bool True once the current file is exhausted or could not be opened */
	protected $atEnd = false;
	/** @var bool True once no further <revision> could be found for the current page */
	protected $atPageEnd = false;
	/** @var int ID of the page most recently read from the stream */
	protected $lastPage = 0;
	/** @var int ID of the revision most recently read; reset to 0 on each new page */
	protected $lastRev = 0;
	/** @var string[] Dump files that have not been opened yet, in reading order */
	protected $infiles = null;

	/**
	 * @param string $infile One or more dump file names separated by ';'.
	 *   The first one is opened immediately; the others are opened in turn
	 *   as each preceding file runs out of pages.
	 */
	public function __construct( $infile ) {
		$this->infiles = explode( ';', $infile );
		$this->reader = new XMLReader();
		$this->openFile( array_shift( $this->infiles ) );
	}

	/**
	 * Point the reader at a dump file, using LIBXML_PARSEHUGE when the
	 * constant is available so every input file is parsed with the same
	 * options.
	 *
	 * If the file cannot be opened the dump is marked as exhausted and the
	 * list of pending files is dropped, so that later calls never try to
	 * read from a reader that has no data loaded.
	 *
	 * @param string $infile
	 * @return bool True if the file was opened
	 */
	protected function openFile( $infile ) {
		$flags = defined( 'LIBXML_PARSEHUGE' ) ? LIBXML_PARSEHUGE : 0;

		// An empty name (e.g. produced by a trailing ';') can never be opened.
		if ( $infile !== '' && $this->reader->open( $infile, null, $flags ) ) {
			$this->atEnd = false;

			return true;
		}

		$this->debug( __METHOD__ . " was unable to open xml file: $infile" );
		$this->infiles = [];
		$this->atEnd = true;

		return false;
	}

	/**
	 * Attempts to fetch the text of a particular page revision
	 * from the dump stream. May return null if the page is
	 * unavailable.
	 *
	 * The stream is only ever read forwards, so requests must arrive in
	 * ascending (page, revision) order; anything already passed is
	 * reported as a miss.
	 *
	 * @param int $page ID number of page to read
	 * @param int $rev ID number of revision to read
	 * @return string|null Revision text, or null on a miss
	 */
	public function prefetch( $page, $rev ) {
		$page = intval( $page );
		$rev = intval( $rev );

		// Advance page by page until we reach (or overshoot) the wanted page.
		while ( $this->lastPage < $page && !$this->atEnd ) {
			$this->debug( "BaseDump::prefetch at page $this->lastPage, looking for $page" );
			$this->nextPage();
		}
		if ( $this->lastPage > $page || $this->atEnd ) {
			$this->debug( "BaseDump::prefetch already past page $page "
				. "looking for rev $rev  [$this->lastPage, $this->lastRev]" );

			return null;
		}

		// Same again for revisions, without leaving the current page.
		while ( $this->lastRev < $rev && !$this->atEnd && !$this->atPageEnd ) {
			$this->debug( "BaseDump::prefetch at page $this->lastPage, rev $this->lastRev, "
				. "looking for $page, $rev" );
			$this->nextRev();
		}
		if ( $this->lastRev === $rev && !$this->atEnd ) {
			$this->debug( "BaseDump::prefetch hit on $page, $rev [$this->lastPage, $this->lastRev]" );

			return $this->nextText();
		}

		$this->debug( "BaseDump::prefetch already past rev $rev on page $page "
			. "[$this->lastPage, $this->lastRev]" );

		return null;
	}

	/**
	 * Send a message to the MediaWiki debug log.
	 *
	 * @param string $str
	 */
	public function debug( $str ) {
		wfDebug( $str . "\n" );
	}

	/**
	 * Move to the next <page> and record its ID. When the current file has
	 * no more pages, close it and open the next pending file instead; the
	 * first page of that file is picked up by the following call.
	 *
	 * @access private
	 */
	public function nextPage() {
		if ( $this->skipTo( 'page', 'mediawiki' ) ) {
			if ( $this->skipTo( 'id' ) ) {
				$this->lastPage = intval( $this->nodeContents() );
				$this->lastRev = 0;
				$this->atPageEnd = false;
			}

			return;
		}

		$this->close();
		if ( count( $this->infiles ) ) {
			$this->openFile( array_shift( $this->infiles ) );
		}
	}

	/**
	 * Move to the next <revision> of the current page and record its ID,
	 * or flag the end of the page if there is none.
	 *
	 * @access private
	 */
	public function nextRev() {
		if ( !$this->skipTo( 'revision' ) ) {
			$this->atPageEnd = true;

			return;
		}
		if ( $this->skipTo( 'id' ) ) {
			$this->lastRev = intval( $this->nodeContents() );
		}
	}

	/**
	 * Read the <text> of the revision the reader is currently inside.
	 *
	 * The search stops at </revision>: a revision without a <text> element
	 * yields null rather than letting the reader run on into the following
	 * page and return unrelated content.
	 *
	 * @access private
	 * @return string|null Text contents, or null if the revision has no <text>
	 */
	public function nextText() {
		if ( !$this->skipTo( 'text', 'revision' ) ) {
			return null;
		}

		return strval( $this->nodeContents() );
	}

	/**
	 * Read forward until an opening element called $name is found.
	 *
	 * @access private
	 * @param string $name Element to stop on
	 * @param string $parent Enclosing element; reaching its closing tag ends the search
	 * @return bool|null True if found, false if </$parent> was reached first
	 *   or the dump was already exhausted, null if the file ended during the
	 *   search (the reader is closed in that case)
	 */
	public function skipTo( $name, $parent = 'page' ) {
		if ( $this->atEnd ) {
			return false;
		}
		while ( $this->reader->read() ) {
			$type = $this->reader->nodeType;
			$tag = $this->reader->name;

			if ( $type === XMLReader::ELEMENT && $tag === $name ) {
				return true;
			}
			if ( $type === XMLReader::END_ELEMENT && $tag === $parent ) {
				$this->debug( "BaseDump::skipTo found </$parent> searching for <$name>" );

				return false;
			}
		}

		return $this->close();
	}

	/**
	 * Shouldn't something like this be built-in to XMLReader?
	 * Fetches text contents of the current element, assuming
	 * no sub-elements or such scary things.
	 *
	 * @return string|null Concatenated text, "" for an empty element, or
	 *   null if the dump is exhausted or ends before the closing tag
	 * @access private
	 */
	public function nodeContents() {
		if ( $this->atEnd ) {
			return null;
		}
		if ( $this->reader->isEmptyElement ) {
			return "";
		}
		$buffer = "";
		while ( $this->reader->read() ) {
			switch ( $this->reader->nodeType ) {
				// Plain WHITESPACE nodes are deliberately not collected.
				case XMLReader::TEXT:
				case XMLReader::SIGNIFICANT_WHITESPACE:
					$buffer .= $this->reader->value;
					break;
				case XMLReader::END_ELEMENT:
					return $buffer;
			}
		}

		return $this->close();
	}

	/**
	 * Close the reader and mark the dump as exhausted.
	 *
	 * @access private
	 * @return null Always null, so callers can write "return $this->close();"
	 */
	public function close() {
		$this->reader->close();
		$this->atEnd = true;

		return null;
	}
}