]> scripts.mit.edu Git - autoinstalls/mediawiki.git/blob - maintenance/refreshLinks.php
MediaWiki 1.16.0
[autoinstalls/mediawiki.git] / maintenance / refreshLinks.php
1 <?php
2 /**
3  * This program is free software; you can redistribute it and/or modify
4  * it under the terms of the GNU General Public License as published by
5  * the Free Software Foundation; either version 2 of the License, or
6  * (at your option) any later version.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11  * GNU General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License along
14  * with this program; if not, write to the Free Software Foundation, Inc.,
15  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
16  * http://www.gnu.org/copyleft/gpl.html
17  *
18  * @ingroup Maintenance
19  */
20
21 require_once( dirname(__FILE__) . '/Maintenance.php' );
22
/**
 * Maintenance script that refreshes the link tables ("Refresh link tables").
 *
 * Depending on the options given it re-derives redirect table entries and/or
 * re-parses pages to rebuild their link tables, and always finishes by
 * deleting link rows whose source page no longer exists.
 */
class RefreshLinks extends Maintenance {
    /**
     * Register the script description, options, positional argument and
     * the default batch size.
     */
    public function __construct() {
        parent::__construct();
        $this->mDescription = "Refresh link tables";
        $this->addOption( 'dfn-only', 'Delete links from nonexistent articles only' );
        $this->addOption( 'new-only', 'Only affect articles with just a single edit' );
        $this->addOption( 'redirects-only', 'Only fix redirects, not all links' );
        $this->addOption( 'old-redirects-only', 'Only fix redirects with no redirect table entry' );
        $this->addOption( 'm', 'Maximum replication lag', false, true );
        $this->addOption( 'e', 'Last page id to refresh', false, true );
        $this->addArg( 'start', 'Page_id to start from, default 1', false );
        $this->setBatchSize( 100 );
    }

    /**
     * Entry point: run the refresh pass (unless --dfn-only was given) and
     * then purge link rows that point from nonexistent pages.
     */
    public function execute() {
        // Read the lag limit before branching: the cleanup call below needs
        // it even when --dfn-only skips the refresh pass.
        $max = $this->getOption( 'm', false );

        if ( !$this->hasOption( 'dfn-only' ) ) {
            $start = $this->getArg( 0, 1 );
            $new = $this->getOption( 'new-only', false );
            $end = $this->getOption( 'e', 0 );
            $redir = $this->getOption( 'redirects-only', false );
            $oldRedir = $this->getOption( 'old-redirects-only', false );
            $this->doRefreshLinks( $start, $new, $max, $end, $redir, $oldRedir );
        }
        $this->deleteLinksFromNonexistent( $max, $this->mBatchSize );
    }

    /**
     * Do the actual link refreshing.
     *
     * Exactly one of three modes runs, checked in this order:
     * old-redirects-only, new-only, or the full page_id range.
     *
     * @param $start int Page_id to start from
     * @param $newOnly bool Only do pages with 1 edit
     * @param $maxLag int Max DB replication lag
     * @param $end int Page_id to stop at (0 = no upper bound; not applied in
     *   the new-only mode)
     * @param $redirectsOnly bool Only fix redirects
     * @param $oldRedirectsOnly bool Only fix redirects without redirect entries
     */
    private function doRefreshLinks( $start, $newOnly = false, $maxLag = false,
                        $end = 0, $redirectsOnly = false, $oldRedirectsOnly = false ) {
        global $wgUser, $wgParser, $wgUseTidy;

        $reportingInterval = 100;
        $dbr = wfGetDB( DB_SLAVE );
        // Both bounds end up inside SQL conditions, so force them to integers.
        $start = intval( $start );
        $end = intval( $end );

        # Don't generate TeX PNGs (lack of a sensible current directory causes errors anyway)
        $wgUser->setOption( 'math', MW_MATH_SOURCE );

        # Don't generate extension images (e.g. Timeline)
        if ( method_exists( $wgParser, "clearTagHooks" ) ) {
            $wgParser->clearTagHooks();
        }

        # Don't use HTML tidy
        $wgUseTidy = false;

        $what = $redirectsOnly ? "redirects" : "links";

        if ( $oldRedirectsOnly ) {
            // Pages flagged as redirects that have no row in the redirect
            // table. Built with select() so table names go through the
            // database layer instead of being hard-coded in raw SQL.
            $conds = array(
                'page_is_redirect' => 1,
                'rd_from' => null,
                $end == 0 ? "page_id >= $start" : "page_id BETWEEN $start AND $end",
            );
            $res = $dbr->select(
                array( 'page', 'redirect' ),
                'page_id',
                $conds,
                __METHOD__,
                array(),
                array( 'redirect' => array( 'LEFT JOIN', 'page_id=rd_from' ) )
            );
            $num = $dbr->numRows( $res );
            $this->output( "Refreshing $num old redirects from $start...\n" );

            $i = 0;
            foreach ( $res as $row ) {
                if ( !( ++$i % $reportingInterval ) ) {
                    $this->output( "$i\n" );
                    wfWaitForSlaves( $maxLag );
                }
                $this->fixRedirect( $row->page_id );
            }
        } elseif ( $newOnly ) {
            $this->output( "Refreshing $what from " );
            $res = $dbr->select( 'page',
                array( 'page_id' ),
                array(
                    'page_is_new' => 1,
                    "page_id >= $start" ),
                __METHOD__
            );
            $num = $dbr->numRows( $res );
            $this->output( "$num new articles...\n" );

            $i = 0;
            foreach ( $res as $row ) {
                if ( !( ++$i % $reportingInterval ) ) {
                    $this->output( "$i\n" );
                    wfWaitForSlaves( $maxLag );
                }
                if ( $redirectsOnly ) {
                    $this->fixRedirect( $row->page_id );
                } else {
                    $this->fixLinksFromArticle( $row->page_id );
                }
            }
        } else {
            if ( !$end ) {
                // No explicit upper bound: cover every id present in either
                // the page table or the redirect table.
                $maxPage = $dbr->selectField( 'page', 'max(page_id)', false );
                $maxRD = $dbr->selectField( 'redirect', 'max(rd_from)', false );
                $end = max( $maxPage, $maxRD );
            }
            $this->output( "Refreshing redirects table.\n" );
            $this->output( "Starting from page_id $start of $end.\n" );

            for ( $id = $start; $id <= $end; $id++ ) {
                if ( !( $id % $reportingInterval ) ) {
                    $this->output( "$id\n" );
                    wfWaitForSlaves( $maxLag );
                }
                $this->fixRedirect( $id );
            }

            if ( !$redirectsOnly ) {
                $this->output( "Refreshing links table.\n" );
                $this->output( "Starting from page_id $start of $end.\n" );

                for ( $id = $start; $id <= $end; $id++ ) {
                    if ( !( $id % $reportingInterval ) ) {
                        $this->output( "$id\n" );
                        wfWaitForSlaves( $maxLag );
                    }
                    $this->fixLinksFromArticle( $id );
                }
            }
        }
    }

    /**
     * Update the redirect entry for a given page.
     *
     * Deletes the redirect table row when the page does not exist or is not
     * a redirect; otherwise asks the Article to update it.
     *
     * @param $id int The page_id of the redirect
     */
    private function fixRedirect( $id ) {
        global $wgTitle, $wgArticle;

        $wgTitle = Title::newFromID( $id );
        $dbw = wfGetDB( DB_MASTER );

        if ( is_null( $wgTitle ) ) {
            // This page doesn't exist (any more)
            // Delete any redirect table entry for it
            $dbw->delete( 'redirect', array( 'rd_from' => $id ),
                __METHOD__ );
            return;
        }
        $wgArticle = new Article( $wgTitle );

        $rt = $wgArticle->followRedirect();

        // Anything that is not an object is treated as "not a redirect".
        if ( !is_object( $rt ) ) {
            // $wgTitle is not a redirect
            // Delete any redirect table entry for it
            $dbw->delete( 'redirect', array( 'rd_from' => $id ),
                __METHOD__ );
        } else {
            $wgArticle->updateRedirectOn( $dbw, $rt );
        }
    }

    /**
     * Run LinksUpdate for all links on a given page_id.
     *
     * Does nothing beyond clearing the link cache when the page or its
     * revision cannot be loaded.
     *
     * @param $id int The page_id
     */
    private function fixLinksFromArticle( $id ) {
        global $wgTitle, $wgParser;

        $wgTitle = Title::newFromID( $id );
        $dbw = wfGetDB( DB_MASTER );

        $linkCache = LinkCache::singleton();
        $linkCache->clear();

        if ( is_null( $wgTitle ) ) {
            return;
        }

        $revision = Revision::newFromTitle( $wgTitle );
        if ( !$revision ) {
            return;
        }

        // Open the transaction only once we know there is work to do, so no
        // early return can leave it dangling without a commit.
        $dbw->begin();

        $options = new ParserOptions;
        $parserOutput = $wgParser->parse( $revision->getText(), $wgTitle, $options, true, true, $revision->getId() );
        $update = new LinksUpdate( $wgTitle, $parserOutput, false );
        $update->doUpdate();
        $dbw->commit();
    }

    /**
     * Removes non-existing links from pages from pagelinks, imagelinks,
     * categorylinks, templatelinks and externallinks tables.
     *
     * @param $maxLag Passed to wfWaitForSlaves() once before the work starts;
     *   the per-batch wait inside the loop uses a fixed value of 5 instead
     * @param $batchSize The size of deletion batches
     *
     * @author Merlijn van Deen <valhallasw@arctus.nl>
     */
    private function deleteLinksFromNonexistent( $maxLag = 0, $batchSize = 100 ) {
        wfWaitForSlaves( $maxLag );

        $dbw = wfGetDB( DB_MASTER );

        // Reads go through a separately created load balancer with result
        // buffering switched off; it is closed again at the end.
        $lb = wfGetLBFactory()->newMainLB();
        $dbr = $lb->getConnection( DB_SLAVE );
        $dbr->bufferResults( false );

        $linksTables = array( // table name => page_id field
            'pagelinks' => 'pl_from',
            'imagelinks' => 'il_from',
            'categorylinks' => 'cl_from',
            'templatelinks' => 'tl_from',
            'externallinks' => 'el_from',
        );

        foreach ( $linksTables as $table => $field ) {
            $this->output( "Retrieving illegal entries from $table... " );

            // SELECT DISTINCT( $field ) FROM $table LEFT JOIN page ON $field=page_id WHERE page_id IS NULL;
            $results = $dbr->select( array( $table, 'page' ),
                $field,
                array( 'page_id' => null ),
                __METHOD__,
                'DISTINCT',
                array( 'page' => array( 'LEFT JOIN', "$field=page_id" ) )
            );

            $counter = 0;
            $list = array();
            $this->output( "0.." );

            foreach ( $results as $row ) {
                $counter++;
                $list[] = $row->$field;
                if ( ( $counter % $batchSize ) == 0 ) {
                    // A full batch has been collected: wait, then delete it.
                    wfWaitForSlaves( 5 );
                    $dbw->delete( $table, array( $field => $list ), __METHOD__ );

                    $this->output( $counter . ".." );
                    $list = array();
                }
            }
            $this->output( $counter );
            // Delete whatever is left over from the last, partial batch.
            if ( count( $list ) > 0 ) {
                $dbw->delete( $table, array( $field => $list ), __METHOD__ );
            }
            $this->output( "\n" );
        }
        $lb->closeAll();
    }
}
281
// Name the class defined in this file, then include the script referenced by
// the DO_MAINTENANCE constant.
$maintClass = 'RefreshLinks';
require_once DO_MAINTENANCE;