root/galaxy-central/scripts/cleanup_datasets/cleanup_datasets.py @ 2

Revision 2, 24.5 KB (committer: hatakeyama, 14 years ago)

import galaxy-central

#!/usr/bin/env python

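"""
Mark and purge old Galaxy histories, libraries, library folders and datasets.

Exactly one of the numbered cleanup actions (-1 through -6) is selected per
invocation; the first positional argument is the Galaxy config file, which
supplies the database connection and file_path settings.

Example invocations (illustrative only -- adjust the config file name, the
retention period and the flags to your own deployment):

    python scripts/cleanup_datasets/cleanup_datasets.py universe_wsgi.ini -d 60 -1
    python scripts/cleanup_datasets/cleanup_datasets.py universe_wsgi.ini -d 60 -r -3
"""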
import os, sys

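# Assumes the script is run from the Galaxy root directory: prepend ./lib so the
# "galaxy" package can be imported, and drop the scripts/ entry from sys.path.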
new_path = [ os.path.join( os.getcwd(), "lib" ) ]
new_path.extend( sys.path[1:] ) # remove scripts/ from the path
sys.path = new_path

from galaxy import eggs
import pkg_resources
pkg_resources.require( "SQLAlchemy >= 0.4" )

import time, ConfigParser, shutil
from datetime import datetime, timedelta
from time import strftime
from optparse import OptionParser

import galaxy.model.mapping
import sqlalchemy as sa
from galaxy.model.orm import and_, eagerload

assert sys.version_info[:2] >= ( 2, 4 )

def main():
    parser = OptionParser()
    parser.add_option( "-d", "--days", dest="days", action="store", type="int", help="number of days (60)", default=60 )
    parser.add_option( "-r", "--remove_from_disk", action="store_true", dest="remove_from_disk", help="remove datasets from disk when purged", default=False )
    parser.add_option( "-i", "--info_only", action="store_true", dest="info_only", help="info about the requested action", default=False )
    parser.add_option( "-f", "--force_retry", action="store_true", dest="force_retry", help="performs the requested actions, but ignores whether it might have been done before. Useful when -r wasn't used, but should have been", default=False )
    parser.add_option( "-1", "--delete_userless_histories", action="store_true", dest="delete_userless_histories", default=False, help="delete userless histories and datasets" )
    parser.add_option( "-2", "--purge_histories", action="store_true", dest="purge_histories", default=False, help="purge deleted histories" )
    parser.add_option( "-3", "--purge_datasets", action="store_true", dest="purge_datasets", default=False, help="purge deleted datasets" )
    parser.add_option( "-4", "--purge_libraries", action="store_true", dest="purge_libraries", default=False, help="purge deleted libraries" )
    parser.add_option( "-5", "--purge_folders", action="store_true", dest="purge_folders", default=False, help="purge deleted library folders" )
    parser.add_option( "-6", "--delete_datasets", action="store_true", dest="delete_datasets", default=False, help="mark deletable datasets as deleted and purge associated dataset instances" )

    ( options, args ) = parser.parse_args()
    ini_file = args[0]

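    # Exactly one cleanup action flag is expected per invocation.  Note that the
    # chained XOR below actually passes for any odd number of selected flags, so
    # callers should take care to supply only one of them.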
    if not ( options.purge_folders ^ options.delete_userless_histories ^ \
             options.purge_libraries ^ options.purge_histories ^ \
             options.purge_datasets ^ options.delete_datasets ):
        parser.print_help()
        sys.exit(0)

    if options.remove_from_disk and options.info_only:
        parser.error( "remove_from_disk and info_only are mutually exclusive" )

    conf_parser = ConfigParser.ConfigParser( {'here':os.getcwd()} )
    conf_parser.read( ini_file )
    configuration = {}
    for key, value in conf_parser.items( "app:main" ):
        configuration[key] = value

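    # Prefer an explicit database_connection setting; otherwise fall back to a
    # SQLite URL built from the database_file setting.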
    if 'database_connection' in configuration:
        database_connection = configuration['database_connection']
    else:
        database_connection = "sqlite:///%s?isolation_level=IMMEDIATE" % configuration["database_file"]
    file_path = configuration.get('file_path', "database/files")
    app = CleanupDatasetsApplication( database_connection=database_connection, file_path=file_path )
    cutoff_time = datetime.utcnow() - timedelta( days=options.days )
    now = strftime( "%Y-%m-%d %H:%M:%S" )

    print "##########################################"
    print "\n# %s - Handling stuff older than %i days" % ( now, options.days )

    if options.info_only:
        print "# Displaying info only ( --info_only )\n"
    elif options.remove_from_disk:
        print "Datasets will be removed from disk.\n"
    else:
        print "Datasets will NOT be removed from disk.\n"

    if options.delete_userless_histories:
        delete_userless_histories( app, cutoff_time, info_only = options.info_only, force_retry = options.force_retry )
    elif options.purge_histories:
        purge_histories( app, cutoff_time, options.remove_from_disk, info_only = options.info_only, force_retry = options.force_retry )
    elif options.purge_datasets:
        purge_datasets( app, cutoff_time, options.remove_from_disk, info_only = options.info_only, force_retry = options.force_retry )
    elif options.purge_libraries:
        purge_libraries( app, cutoff_time, options.remove_from_disk, info_only = options.info_only, force_retry = options.force_retry )
    elif options.purge_folders:
        purge_folders( app, cutoff_time, options.remove_from_disk, info_only = options.info_only, force_retry = options.force_retry )
    elif options.delete_datasets:
        delete_datasets( app, cutoff_time, options.remove_from_disk, info_only = options.info_only, force_retry = options.force_retry )

    sys.exit(0)

def delete_userless_histories( app, cutoff_time, info_only = False, force_retry = False ):
    # Deletes userless histories whose update_time value is older than the cutoff_time.
    # The purge history script will handle marking DatasetInstances as deleted.
    # Nothing is removed from disk yet.
    history_count = 0
    start = time.time()
    if force_retry:
        histories = app.sa_session.query( app.model.History ) \
                                  .filter( and_( app.model.History.table.c.user_id==None,
                                                 app.model.History.table.c.update_time < cutoff_time ) )
    else:
        histories = app.sa_session.query( app.model.History ) \
                                  .filter( and_( app.model.History.table.c.user_id==None,
                                                 app.model.History.table.c.deleted==False,
                                                 app.model.History.table.c.update_time < cutoff_time ) )
    for history in histories:
        if not info_only:
            print "Deleting history id ", history.id
            history.deleted = True
            app.sa_session.add( history )
            app.sa_session.flush()
        history_count += 1
    stop = time.time()
    print "Deleted %d histories" % history_count
    print "Elapsed time: ", stop - start
    print "##########################################"

def purge_histories( app, cutoff_time, remove_from_disk, info_only = False, force_retry = False ):
    # Purges deleted histories whose update_time is older than the cutoff_time.
    # The dataset associations of each history are also marked as deleted.
    # The Purge Dataset method will purge each Dataset as necessary
    # history.purged == True simply means that it can no longer be undeleted
    # i.e. all associated datasets are marked as deleted
    history_count = 0
    start = time.time()
    if force_retry:
        histories = app.sa_session.query( app.model.History ) \
                                  .filter( and_( app.model.History.table.c.deleted==True,
                                                 app.model.History.table.c.update_time < cutoff_time ) ) \
                                  .options( eagerload( 'datasets' ) )
    else:
        histories = app.sa_session.query( app.model.History ) \
                                  .filter( and_( app.model.History.table.c.deleted==True,
                                                 app.model.History.table.c.purged==False,
                                                 app.model.History.table.c.update_time < cutoff_time ) ) \
                                  .options( eagerload( 'datasets' ) )
    for history in histories:
        for dataset_assoc in history.datasets:
            _purge_dataset_instance( dataset_assoc, app, remove_from_disk, info_only = info_only ) #mark a DatasetInstance as deleted, clear associated files, and mark the Dataset as deleted if it is deletable
        if not info_only:
            # TODO: should the Delete DefaultHistoryPermissions be deleted here?  This was incorrectly
            # done in the _list_delete() method of the history controller, so copied it here.  Not sure
            # if we should ever delete info like this from the db though, so commented out for now...
            #for dhp in history.default_permissions:
            #    dhp.delete()
            print "Purging history id ", history.id
            history.purged = True
            app.sa_session.add( history )
            app.sa_session.flush()
        history_count += 1
    stop = time.time()
    print 'Purged %d histories.' % history_count
    print "Elapsed time: ", stop - start
    print "##########################################"

def purge_libraries( app, cutoff_time, remove_from_disk, info_only = False, force_retry = False ):
    # Purges deleted libraries whose update_time is older than the cutoff_time.
    # The dataset associations of each library are also marked as deleted.
    # The Purge Dataset method will purge each Dataset as necessary
    # library.purged == True simply means that it can no longer be undeleted
    # i.e. all associated LibraryDatasets/folders are marked as deleted
    library_count = 0
    start = time.time()
    if force_retry:
        libraries = app.sa_session.query( app.model.Library ) \
                                  .filter( and_( app.model.Library.table.c.deleted==True,
                                                 app.model.Library.table.c.update_time < cutoff_time ) )
    else:
        libraries = app.sa_session.query( app.model.Library ) \
                                  .filter( and_( app.model.Library.table.c.deleted==True,
                                                 app.model.Library.table.c.purged==False,
                                                 app.model.Library.table.c.update_time < cutoff_time ) )
    for library in libraries:
        _purge_folder( library.root_folder, app, remove_from_disk, info_only = info_only )
        if not info_only:
            print "Purging library id ", library.id
            library.purged = True
            app.sa_session.add( library )
            app.sa_session.flush()
        library_count += 1
    stop = time.time()
    print '# Purged %d libraries.' % library_count
    print "Elapsed time: ", stop - start
    print "##########################################"

def purge_folders( app, cutoff_time, remove_from_disk, info_only = False, force_retry = False ):
    # Purges deleted folders whose update_time is older than the cutoff_time.
    # The dataset associations of each folder are also marked as deleted.
    # The Purge Dataset method will purge each Dataset as necessary
    # libraryFolder.purged == True simply means that it can no longer be undeleted
    # i.e. all associated LibraryDatasets/folders are marked as deleted
    folder_count = 0
    start = time.time()
    if force_retry:
        folders = app.sa_session.query( app.model.LibraryFolder ) \
                                .filter( and_( app.model.LibraryFolder.table.c.deleted==True,
                                               app.model.LibraryFolder.table.c.update_time < cutoff_time ) )
    else:
        folders = app.sa_session.query( app.model.LibraryFolder ) \
                                .filter( and_( app.model.LibraryFolder.table.c.deleted==True,
                                               app.model.LibraryFolder.table.c.purged==False,
                                               app.model.LibraryFolder.table.c.update_time < cutoff_time ) )
    for folder in folders:
        _purge_folder( folder, app, remove_from_disk, info_only = info_only )
        folder_count += 1
    stop = time.time()
    print '# Purged %d folders.' % folder_count
    print "Elapsed time: ", stop - start
    print "##########################################"

def delete_datasets( app, cutoff_time, remove_from_disk, info_only = False, force_retry = False ):
    # Marks datasets as deleted if associated items are all deleted.
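    # The two queries below collect candidate Dataset ids by outer-joining the
    # dataset table to its history (HDA) and library (LDDA) association tables,
    # restricted to associations older than the cutoff.  Unless force_retry is
    # set, only associations already marked deleted, on datasets not yet marked
    # deleted, are considered; _dataset_is_deletable() then confirms that no
    # active associations remain before anything is marked deleted.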
    start = time.time()
    if force_retry:
        history_dataset_ids_query = sa.select( ( app.model.Dataset.table.c.id,
                                                 app.model.Dataset.table.c.state ),
                                               whereclause = app.model.HistoryDatasetAssociation.table.c.update_time < cutoff_time,
                                               from_obj = [ sa.outerjoin( app.model.Dataset.table,
                                                                          app.model.HistoryDatasetAssociation.table ) ] )
        library_dataset_ids_query = sa.select( ( app.model.Dataset.table.c.id,
                                                 app.model.Dataset.table.c.state ),
                                               whereclause = app.model.LibraryDatasetDatasetAssociation.table.c.update_time < cutoff_time,
                                               from_obj = [ sa.outerjoin( app.model.Dataset.table,
                                                                          app.model.LibraryDatasetDatasetAssociation.table ) ] )
    else:
        # We really only need the id column here, but sqlalchemy barfs when trying to select only 1 column
        history_dataset_ids_query = sa.select( ( app.model.Dataset.table.c.id,
                                                 app.model.Dataset.table.c.state ),
                                               whereclause = sa.and_( app.model.Dataset.table.c.deleted == False,
                                                                      app.model.HistoryDatasetAssociation.table.c.update_time < cutoff_time,
                                                                      app.model.HistoryDatasetAssociation.table.c.deleted == True ),
                                               from_obj = [ sa.outerjoin( app.model.Dataset.table,
                                                                          app.model.HistoryDatasetAssociation.table ) ] )
        library_dataset_ids_query = sa.select( ( app.model.Dataset.table.c.id,
                                                 app.model.Dataset.table.c.state ),
                                               whereclause = sa.and_( app.model.Dataset.table.c.deleted == False,
                                                                      app.model.LibraryDatasetDatasetAssociation.table.c.update_time < cutoff_time,
                                                                      app.model.LibraryDatasetDatasetAssociation.table.c.deleted == True ),
                                               from_obj = [ sa.outerjoin( app.model.Dataset.table,
                                                                          app.model.LibraryDatasetDatasetAssociation.table ) ] )
    history_dataset_ids = [ row.id for row in history_dataset_ids_query.execute() ]
    library_dataset_ids = [ row.id for row in library_dataset_ids_query.execute() ]
    dataset_ids = history_dataset_ids + library_dataset_ids
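    # A dataset can appear in both id lists (and a query can return the same id
    # more than once), so ids already handled are tracked in `skip` to avoid
    # processing the same dataset twice.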
    skip = []
    deleted_dataset_count = 0
    deleted_instance_count = 0
    for dataset_id in dataset_ids:
        print "######### Processing dataset id:", dataset_id
        dataset = app.sa_session.query( app.model.Dataset ).get( dataset_id )
        if dataset.id not in skip and _dataset_is_deletable( dataset ):
            deleted_dataset_count += 1
            for dataset_instance in dataset.history_associations + dataset.library_associations:
                print "Associated Dataset instance: ", dataset_instance.__class__.__name__, dataset_instance.id
                _purge_dataset_instance( dataset_instance, app, remove_from_disk, include_children=True, info_only=info_only, is_deletable=True )
                deleted_instance_count += 1
        skip.append( dataset.id )
    stop = time.time()
    print "Examined %d datasets, marked %d as deleted and purged %d dataset instances" % ( len( skip ), deleted_dataset_count, deleted_instance_count )
    print "Total elapsed time: ", stop - start
    print "##########################################"

def purge_datasets( app, cutoff_time, remove_from_disk, info_only = False, force_retry = False ):
    # Purges deleted datasets whose update_time is older than cutoff_time.  Files may or may
    # not be removed from disk.
    dataset_count = 0
    disk_space = 0
    start = time.time()
    if force_retry:
        datasets = app.sa_session.query( app.model.Dataset ) \
                                 .filter( and_( app.model.Dataset.table.c.deleted==True,
                                                app.model.Dataset.table.c.purgable==True,
                                                app.model.Dataset.table.c.update_time < cutoff_time ) )
    else:
        datasets = app.sa_session.query( app.model.Dataset ) \
                                 .filter( and_( app.model.Dataset.table.c.deleted==True,
                                                app.model.Dataset.table.c.purgable==True,
                                                app.model.Dataset.table.c.purged==False,
                                                app.model.Dataset.table.c.update_time < cutoff_time ) )
    for dataset in datasets:
        file_size = dataset.file_size
        _purge_dataset( app, dataset, remove_from_disk, info_only = info_only )
        dataset_count += 1
        try:
            disk_space += file_size
        except:
            pass
    stop = time.time()
    print 'Purged %d datasets' % dataset_count
    if remove_from_disk:
        print 'Freed disk space: ', disk_space
    print "Elapsed time: ", stop - start
    print "##########################################"

def _purge_dataset_instance( dataset_instance, app, remove_from_disk, include_children=True, info_only=False, is_deletable=False ):
    # A dataset_instance is either an HDA or an LDDA.  Purging a dataset instance marks the instance as deleted,
    # and marks the associated dataset as deleted if it is not associated with another active DatasetInstance.
    if not info_only:
        print "Deleting dataset_instance ", str( dataset_instance ), " id ", dataset_instance.id
        dataset_instance.mark_deleted( include_children = include_children )
        dataset_instance.clear_associated_files()
        app.sa_session.add( dataset_instance )
        app.sa_session.flush()
        app.sa_session.refresh( dataset_instance.dataset )
    if is_deletable or _dataset_is_deletable( dataset_instance.dataset ):
        # Calling methods may have already checked _dataset_is_deletable, if so, is_deletable should be True
        _delete_dataset( dataset_instance.dataset, app, remove_from_disk, info_only=info_only, is_deletable=is_deletable )
    #need to purge children here
    if include_children:
        for child in dataset_instance.children:
            _purge_dataset_instance( child, app, remove_from_disk, include_children = include_children, info_only = info_only )

def _dataset_is_deletable( dataset ):
    #a dataset is deletable when it no longer has any non-deleted associations
    return not bool( dataset.active_history_associations or dataset.active_library_associations )

def _delete_dataset( dataset, app, remove_from_disk, info_only=False, is_deletable=False ):
    # Marks a base dataset as deleted; HDAs/LDDAs associated with the dataset can no longer be undeleted.
    # Metadata files attached to associated dataset instances are removed now.
    if not is_deletable and not _dataset_is_deletable( dataset ):
        print "This Dataset (%i) is not deletable, associated Metadata Files will not be removed.\n" % ( dataset.id )
    else:
        # Mark all associated MetadataFiles as deleted and purged and remove them from disk
        metadata_files = []
        # Let's create a list of metadata files, then perform actions on them
        for hda in dataset.history_associations:
            for metadata_file in app.sa_session.query( app.model.MetadataFile ) \
                                               .filter( app.model.MetadataFile.table.c.hda_id==hda.id ):
                metadata_files.append( metadata_file )
        for lda in dataset.library_associations:
            for metadata_file in app.sa_session.query( app.model.MetadataFile ) \
                                               .filter( app.model.MetadataFile.table.c.lda_id==lda.id ):
                metadata_files.append( metadata_file )
        for metadata_file in metadata_files:
            print "The following metadata files attached to associations of Dataset '%s' have been purged:" % dataset.id
            if not info_only:
                if remove_from_disk:
                    try:
                        print "Removing disk file ", metadata_file.file_name
                        os.unlink( metadata_file.file_name )
                    except Exception, e:
                        print "Error, exception: %s caught attempting to purge metadata file %s\n" %( str( e ), metadata_file.file_name )
                    metadata_file.purged = True
                    app.sa_session.add( metadata_file )
                    app.sa_session.flush()
                metadata_file.deleted = True
                app.sa_session.add( metadata_file )
                app.sa_session.flush()
            print "%s" % metadata_file.file_name
        print "Deleting dataset id", dataset.id
        dataset.deleted = True
        app.sa_session.add( dataset )
        app.sa_session.flush()

def _purge_dataset( app, dataset, remove_from_disk, info_only = False ):
    if dataset.deleted:
        try:
            if dataset.purgable and _dataset_is_deletable( dataset ):
                if not info_only:
                    # Remove files from disk and update the database
                    if remove_from_disk:
                        # TODO: should permissions on the dataset be deleted here?
                        print "Removing disk file ", dataset.file_name
                        os.unlink( dataset.file_name )
                        # Remove associated extra files from disk if they exist
                        if dataset.extra_files_path and os.path.exists( dataset.extra_files_path ):
                            shutil.rmtree( dataset.extra_files_path ) #we need to delete the directory and its contents; os.unlink would always fail on a directory
                    print "Purging dataset id", dataset.id
                    dataset.purged = True
                    app.sa_session.add( dataset )
                    app.sa_session.flush()
            else:
                print "This dataset (%i) is not purgable, the file (%s) will not be removed.\n" % ( dataset.id, dataset.file_name )
        except OSError, exc:
            print "Error, dataset file has already been removed: %s" % str( exc )
            print "Purging dataset id", dataset.id
            dataset.purged = True
            app.sa_session.add( dataset )
            app.sa_session.flush()
        except Exception, exc:
            print "Error attempting to purge data file: ", dataset.file_name, " error: ", str( exc )
    else:
        print "Error: '%s' has not previously been deleted, so it cannot be purged\n" % dataset.file_name

def _purge_folder( folder, app, remove_from_disk, info_only = False ):
    """Purges a folder and its contents, recursively"""
    for ld in folder.datasets:
        print "Deleting library dataset id ", ld.id
        ld.deleted = True
        for ldda in [ld.library_dataset_dataset_association] + ld.expired_datasets:
            _purge_dataset_instance( ldda, app, remove_from_disk, info_only = info_only ) #mark a DatasetInstance as deleted, clear associated files, and mark the Dataset as deleted if it is deletable
    for sub_folder in folder.folders:
        _purge_folder( sub_folder, app, remove_from_disk, info_only = info_only )
    if not info_only:
        # TODO: should the folder permissions be deleted here?
        print "Purging folder id ", folder.id
        folder.purged = True
        app.sa_session.add( folder )
        app.sa_session.flush()

class CleanupDatasetsApplication( object ):
    """Encapsulates the state of a Universe application"""
    def __init__( self, database_connection=None, file_path=None ):
        if database_connection is None:
            raise Exception( "CleanupDatasetsApplication requires a database_connection value" )
        if file_path is None:
            raise Exception( "CleanupDatasetsApplication requires a file_path value" )
        self.database_connection = database_connection
        self.file_path = file_path
        # Setup the database engine and ORM
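        # create_tables=False: only map the existing schema; this script never creates tables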
        self.model = galaxy.model.mapping.init( self.file_path, self.database_connection, engine_options={}, create_tables=False )
    @property
    def sa_session( self ):
        """
        Returns a SQLAlchemy session -- currently just gets the current
        session from the threadlocal session context, but this is provided
        to allow migration toward a more SQLAlchemy 0.4 style of use.
        """
        return self.model.context.current

if __name__ == "__main__": main()