#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright 2015 Tealium Inc. All Rights Reserved.
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# History
# 2015-04-01: Initial build. Tested using Boto version 2.36.0

"""
Class for extracting data from AWS S3

Requirements: Must install Boto
"""

__author__ = 'Aaron Toledo (aaron.toledo@tealium.com)'
__copyright__ = 'Copyright 2015, Tealium Inc.'
__credits__ = ['Aaron Toledo']
__license__ = 'Tealium Inc.'
__version__ = '1.0'
__maintainer__ = 'Aaron Toledo'
__email__ = 'aaron.toledo@tealium.com'
__status__ = 'Prototype'

from boto.s3.connection import S3Connection
from boto.s3.connection import OrdinaryCallingFormat
from boto.s3.key import Key
import sys, os, hashlib, re

# Class for connecting to S3 via Boto
# Features: Features for your S3 bucket
# 1. List all files
# 2. List new files that you don't have locally yet
# 3.
Download new files that you don't have locally yet ############################## class TealiumS3: # Initialize the class def __init__(self,creds): self.access_key = creds['access_key'] self.secret_key = creds['secret_key'] self.bucket = creds['bucket'] self.prefix = creds['prefix'] self.local_path = creds['local_path'] self.conn = self.__tealiumConnect() # Private class for authentication def __tealiumConnect(self): return S3Connection(aws_access_key_id=self.access_key,aws_secret_access_key=self.secret_key,calling_format=OrdinaryCallingFormat()) # Private class for counting new files to download def __listNewFiles(self): b = self.conn.get_bucket(self.bucket,validate=False) found = True for k in b.list(self.prefix): k_string = str(k.key) try: if not os.path.exists(self.local_path+k_string): if found: found = False counter = 0 counter+=1 except OSError as e: print 'FAILED'+' : '+e.strerror+' : '+k.name if found: return 0 else: return counter # Public class for listing all files in bucket def listAllFiles(self): b = self.conn.get_bucket(self.bucket,validate=False) counter = 0 print '------------------------------------' print 'Listing all files:' print '------------------------------------' print '"Name" "Size" "Last Modified"' for k in b.list(self.prefix): print k.name.encode('utf-8'), k.size, k.last_modified counter+=1 print '------------------------------------' print 'Total files: ' + str(counter) print '------------------------------------' # Public class for listing all files in bucket def listNewFiles(self): b = self.conn.get_bucket(self.bucket,validate=False) found = True for k in b.list(self.prefix): k_string = str(k.key) try: if not os.path.exists(self.local_path+k_string): if found: print '------------------------------------' print 'Listing new files:' print '------------------------------------' print '"Name" "Size" "Last Modified"' found = False counter = 0 print k.name.encode('utf-8'), k.size, k.last_modified counter+=1 except OSError as e: print 
'FAILED'+' : '+e.strerror+' : '+k.name if found: print 'No new files found.' else: print '------------------------------------' print 'Total files: ' + str(counter) print '------------------------------------' # Public class downloading files that haven't been downloaded yet def getFiles(self): b = self.conn.get_bucket(self.bucket,validate=False) found = True total_records = self.__listNewFiles() if total_records > 0: for k in b.list(self.prefix): k_string = str(k.key) try: if not os.path.exists(self.local_path+k_string): if found: print 'Downloading new files to: ' + self.local_path found = False counter = 0 k.get_contents_to_filename(self.local_path+k_string) counter+=1 output = ' ' + str(int((float(counter)/float(total_records)*100))) + ' percent complete. ' + str(counter) + ' of ' + str(total_records) + ' files.' sys.stdout.write('%s\r' % output) sys.stdout.flush() except OSError as e: print 'FAILED'+' : '+e.strerror+' : '+k.name print 'MESSAGE: You must mirror the folder structure on AWS S3.' print '' print 'Total files downloaded: '+str(counter)+' files.' else: print 'No new files to download.' 
# Public class downloading single file # aws s3api get-object --bucket uconnect.tealiumiq.com --key services-aaron/main/events/60f94dd5-b570-4612-c59a-c367e6213999/services-aaron-main-events-60f94dd5-b570-4612-c59a-c367e6213999-20150523070521-04af10ee793c4ee590e35a6bf874f93d.gzip /Users/aarontoledo/Desktop/temp/file.gzip def getSingleFile(self): b = self.conn.get_bucket(self.bucket,validate=False) k_string = raw_input('Enter a key:') while True: try: k = b.get_key(k_string) k.get_contents_to_filename(self.local_path+k_string) print 'AWS Bucket: %s' % self.bucket print 'AWS Key: %s' % k.key print 'AWS Size: %s' % k.size print 'AWS ETag Hash: %s' % re.sub('"','',str(k.etag)) hasher = hashlib.md5() with open(self.local_path+k_string, 'rb') as afile: buf = afile.read() hasher.update(buf) print 'AWS Local Hash: %s' % hasher.hexdigest() print 'Hashes match: %s' % str(hasher.hexdigest() == re.sub('"','',str(k.etag))) break except: print 'FAILED: Key does not exist' k_string = raw_input('Re-enter a key:') continue