#!usr/bin/env python from __future__ import print_function ################################################################################ # A GENERAL EXPLANATION """ folding.py This script performs fragment insertion for an input protein sequence. The sequence may be explicit, in a FASTA file, or in a PDB file. Two fragment files, preferably one longer than the other, must also be provided. Fragment insertion is accompanied by a Monte Carlo assessment allowing the conformation to escape local minima. Output structures from this protocol are intended to proceed into a refinement step (such as that in refinement.py) to produce reasonable estimates of the protein conformation. Instructions: 1) ensure that your PDB file is in the current directory 2) run the script: from commandline >python D060_Folding.py from within python/ipython [1]: run D060_Folding.py Author: Evan H. Baugh revised and motivated by Robert Schleif Updated by Boon Uranukul, 6/9/12 Simplified special constant seed initialization ~ Labonte References: P. Bradley, K. Misura, and D. Baker, "Toward high-resolution de novo structure prediction for small proteins," Science 309 (5742) 1868-1871 (2005). """ ################################################################################ # THE BASIC PROTOCOL, sample_folding """ This sample script is setup for usage with commandline arguments, default running within a python interpreter, or for import within a python interpreter, (exposing the sample_folding method) The method sample_folding: 1. creates a pose from the desired sequence (fullatom) 2. set all of the pose's backbone torsion angles 3. creates a (fullatom) reference copy of the pose 4. creates Movers for switching between fullatom and centroid 5. convert both poses to centroid 6. creates a MoveMap with all backbone torsion angles free 7. sets up ClassicFragmentMovers for inserting fragments backbone structures for long and short fragments 8. creates RepeatMovers for the long and short fragment Movers 9. create a PyMOL_Observer for viewing intermediate output 10. creates low and high resolution ScoreFunctions 11. sets up a RepeatMover on a TrialMover of a SequenceMover -setup the TrialMover a. create a SequenceMover with the: >RepeatMover on the long fragment Mover >RepeatMover on the short fragment Mover b. create a MonteCarlo object for assessing moves c. create the TrialMover (on the SequenceMover) -create the RepeatMover (on the TrialMover) 12. creates a (Py)JobDistributor for managing multiple trajectories 13. stores the original score evaluation (centroid) 14. performs low-resolution folding: a. set necessary variables -reload the starting (centroid) pose -change the pose's PDBInfo.name, for the PyMOLMover -reset the MonteCarlo object (for the TrialMover) b. perform sampling and assessment using the final RepeatMover c. convert the lowest scoring decoy to fullatom using the SequenceMover d. export the (lowest scoring) decoy structure -recover the lowest scoring (centroid) decoy -store the decoy score -convert the decoy to fullatom -guess potential disulfide bridges -output the decoy structure using the PyJobDistributor -export the decoy structure using the PyMOL_Observer 15. outputs the score evaluations The second method, guess_disulfides, is called in sample_folding to print out cysteine residues which are close to each other in the decoy structure. """ import optparse # for sorting options from rosetta import * from pyrosetta import * init(extra_options = "-constant_seed") # normally, init() works fine # for this sample script, we want to ease comparison by making sure all random # variables generated by Rosetta in this instance of PyRosetta start from a # constant seed # here we provide the additional argument "-constant_seed" which sets all the # random variables generated by Rosetta from a constant seed (google random # seed for more information) # some options can be set after initialization, please see PyRosetta.org FAQs # for more information # WARNING: option '-constant_seed' is for testing only! MAKE SURE TO REMOVE IT IN PRODUCTION RUNS!!!!! import os; os.chdir('.test.output') ######### # Methods def sample_folding(sequence, long_frag_filename, long_frag_length, short_frag_filename, short_frag_length, kT = 3.0, long_inserts = 1, short_inserts = 3, cycles = 40, jobs = 1, job_output = 'fold_output'): """ Performs exporting structures to a PyMOL instance Output structures are named _(job#).pdb """ # 1. create a pose from the desired sequence (fullatom) pose = Pose() # the method make_pose_from_sequence produces a complete IDEALIZED # protein conformation of the input sequence, the ResidueTypeSet (third # argument below) may be varied, and this method supports non-proteogenic # chemistry (though it is still a Rosetta Residue). however this syntax # is more involved and not robust to user errors, and not presented here # small differences in bond lengths and bond angles WILL change the results, #### if you desire an alternate starting conformation, alter steps #### 1. and 2. as you please make_pose_from_sequence(pose, sequence, 'fa_standard') # 2. linearize the pose by setting backbone torsions to large values # the method make_pose_from_sequence does not create the new pose's # PDBInfo object, so its done here, without it an error occurs later pose.pdb_info(rosetta.core.pose.PDBInfo( pose.total_residue() )) for i in range(1, pose.total_residue() + 1): pose.set_omega(i, 180) pose.set_phi(i, -150) # reasonably straight pose.set_psi(i, 150) #### if you want to see the decoy scores, the PDBInfo needs these lines #pose.pdb_info().chain(i, 'A') # necessary to color by score #pose.pdb_info().number(i, i) # for PDB numbering #### # 3. create a (fullatom) reference copy of the pose test_pose = Pose() test_pose.assign( pose ) test_pose.pdb_info().name('linearized pose') # 4. create centroid <--> fullatom conversion Movers to_centroid = SwitchResidueTypeSetMover('centroid') # centroid Residue objects, of amino acids, have all their sidechain atoms # replaced by a single representative "atom" to speed up calculations to_fullatom = SwitchResidueTypeSetMover('fa_standard') # 5. convert the poses to centroid to_centroid.apply(pose) to_centroid.apply(test_pose) # 6. create the MoveMap, all backbone torsions free movemap = MoveMap() movemap.set_bb(True) # minimizing the centroid chi angles (the sidechain centroid atoms) is # almost always USELESS since this compression is performed for speed, # not accuracy and clashes usually occur when converting to fullatom # 7. setup the ClassicFragmentMovers # for the long fragments file # this "try--except" is used to catch improper fragment files try: fragset_long = core.fragment.ConstantLengthFragSet( long_frag_length , long_frag_filename ) #### the ConstantLengthFragSet is overloaded, this same #### ConstantLengthFragSet can be obtained with different syntax # to obtain custom fragments, see Generating Fragment Files below except: raise IOError('Make sure long_frag_length matches the fragments in\n\ long_frag_file and that long_frag_file is valid') long_frag_mover = protocols.simple_moves.ClassicFragmentMover(fragset_long, movemap) # and for the short fragments file # this "try--except" is used to catch improper fragment files try: fragset_short = core.fragment.ConstantLengthFragSet(short_frag_length, short_frag_filename) except: raise IOError('Make sure short_frag_length matches the fragments in\n\ short_frag_file and that short_frag_file is valid') short_frag_mover = protocols.simple_moves.ClassicFragmentMover(fragset_short, movemap) # 8. setup RepeatMovers for the ClassicFragmentMovers insert_long_frag = protocols.moves.RepeatMover(long_frag_mover, long_inserts) insert_short_frag = protocols.moves.RepeatMover(short_frag_mover, short_inserts) # 9. create a PyMOL_Observer for exporting structures to PyMOL (optional) # the PyMOL_Observer object owns a PyMOLMover and monitors pose objects for # structural changes, when changes are detected the new structure is # sent to PyMOL # fortunately, this allows investigation of full protocols since # intermediate changes are displayed, it also eliminates the need to # manually apply the PyMOLMover during a custom protocol # unfortunately, this can make the output difficult to interpret (since you # aren't explicitly telling it when to export) and can significantly slow # down protocols since many structures are output (PyMOL can also slow # down if too many structures are provided and a fast machine may # generate structures too quickly for PyMOL to read, the # "Buffer clean up" message # uncomment the line below to use PyMOL_Observer ## AddPyMOLObserver(test_pose, True) # 10. create ScoreFunctions # for low-resolution, centroid, poses necessary for the TrialMover's # MonteCarlo object (see below) scorefxn_low = create_score_function('score3') # for high-resolution, fullatom, poses necessary for scoring final output # from the PyJobDistributor (see below) scorefxn_high = get_fa_scorefxn() # create_score_function('standard', 'score12') # 11. setup a RepeatMover on a TrialMover of a SequenceMover # -setup a TrialMover # a. create a SequenceMover of the fragment insertions #### add any other moves you desire folding_mover = protocols.moves.SequenceMover() folding_mover.add_mover(insert_long_frag) folding_mover.add_mover(insert_short_frag) # b. create a MonteCarlo object to define success/failure # must reset the MonteCarlo object for each trajectory! mc = MonteCarlo(test_pose, scorefxn_low, kT) # c. create the TrialMover trial = TrialMover(folding_mover, mc) #### for each trajectory, try cycles number of applications # -create the RepeatMover folding = protocols.moves.RepeatMover(trial, cycles) # 12. create a (Py)JobDistributor jd = PyJobDistributor(job_output, jobs, scorefxn_high) # 13. store the score evaluations for output # printing the scores as they are produced would be difficult to read, # Rosetta produces a lot of verbose output when running scores = [0]*(jobs + 1) scores[0] = scorefxn_low(pose) # 14. perform folding by counter = 0 # for exporting to PyMOL while not jd.job_complete: # a. set necessary variables for the new trajectory # -reload the starting pose test_pose.assign(pose) # -change the pose's PDBInfo.name, for the PyMOL_Observer counter += 1 test_pose.pdb_info().name(job_output + '_' + str(counter)) # -reset the MonteCarlo object (sets lowest_score to that of test_pose) mc.reset(test_pose) #### if you create a custom protocol, you may have additional #### variables to reset, such as kT #### if you create a custom protocol, this section will most likely #### change, many protocols exist as single Movers or can be #### chained together in a sequence (see above) so you need #### only apply the final Mover # b. apply the refinement protocol folding.apply(test_pose) #### # c. export the lowest scoring decoy structure for this trajectory # -recover the lowest scoring decoy structure mc.recover_low(test_pose) # -store the final score for this trajectory scores[counter] = scorefxn_low(test_pose) # -convert the decoy to fullatom # the sidechain conformations will all be default, # normally, the decoys would NOT be converted to fullatom before # writing them to PDB (since a large number of trajectories would # be considered and their fullatom score are unnecessary) # here the fullatom mode is reproduced to make the output easier to # understand and manipulate, PyRosetta can load in PDB files of # centroid structures, however you must convert to fullatom for # nearly any other application to_fullatom.apply(test_pose) # -guess what cysteines are involved in disulfide bridges guess_disulfides(test_pose) # -output the fullatom decoy structure into a PDB file jd.output_decoy(test_pose) # -export the final structure to PyMOL test_pose.pdb_info().name(job_output + '_' + str(counter) + '_fa') #### if you want to see the decoy scores, uncomment the line below #scorefxn_high( test_pose ) # 15. output the score evaluations print( '===== Centroid Scores =====' ) print( 'Original Score\t:\t', scores[0] ) for i in range(1, len(scores)): # print out the job scores # the "[:14].ljust(14)" is to force the text alignment print( (job_output + '_' + str( i ))[:14].ljust(14) +\ '\t:\t', scores[i] ) return scores # for other protocols # locates all cysteines, make bonds between them, output the bonds that lower # the score def guess_disulfides(pose, cutoff = 6.0): """ A quick method for probing a protein for cysteine residues close to each other (within ) """ # find all cysteine residues and consider possible disulfides cys = [i for i in range(1, pose.total_residue() + 1) if pose.residue(i).name1() == 'C'] partners = [0]*sum( range(len(cys)) ) # all disulfides possible i = 0 # create all combinations for first in range(len(cys[:-1])): for second in cys[first + 1:]: partners[i] = (cys[first], second) i += 1 # try each disulfide, if it lowers the score, print it to screen print( '='*80 ) print( 'Potential Disulfides:' ) for pair in partners: # for a fullatom cysteine in PyRosetta, the 6th atom is sulfur separation = (pose.residue(pair[0]).xyz(6) - pose.residue(pair[1]).xyz(6)).norm if separation < cutoff: print( 'between (pose numbered) residue' , pair[0] , 'and' ,\ pair[1] , '|' , separation , 'Angstrom separation' ) print( '='*80 ) # to manipulate disulfide bonds, use: # formation: form_disulfide(pose.conformation(), 6, 16) # cleavage: change_cys_state(6, 'CYS', pose.conformation() ) # change_cys_state(16, 'CYS', pose.conformation() ) ################################################################################ # INTERPRETING RESULTS """ The (Py)JobDistributor will output the lowest scoring pose for each trajectory (as a PDB file), recording the score in .fasc. Generally, the decoy generated with the lowest score contains the best prediction for the protein conformation. In this case the scores may be misleading since the output PDB files were converted into fullatom mode. Centroid scores are written to the standard output (the screen) but not to file. If you need these scores, you must reload the poses, convert them to centroid, and rescore them. Using low sampling with an average acceptance (kT), many of the structures will appear incomplete (not fully folded) and unrealistic (clashes). Fragment insertion efficiently reproduces secondary structural elements in the proposed fold. If this method is followed by refinement, a higher kT may be useful since this will produce compact centroid folds that have clashes easily removed during refinement. Minimization steps can be confounded by clashes but this approach yields better fold predictions. For each trajectory, the PyMOLMover will export numerous intermediate centroid conformations and the final fullatom fold. Depending on the machine used, the structures may be produced too quickly for PyMOL to display properly. If this occurs, some structures will NOT load into PyMOL and a message will print in the PyMOL upper window indicating "Buffer clean up". Please change where the PyMOLMover is applied to view different output or change the method to include pauses (try the Python time module, specifically time.sleep). These alterations are tedious but provide an effective means for tuning the protocol parameters while maintaining biochemical feasibility (i.e. you can know if the fold is wrong). The method guess_disulfides prints out pairs of cysteine residues in the input pose which are within a certain distance (default 6 Angstroms). The method sample_folding calls guess_disulfides for the final decoy of each trajectory. To manipulate (fullatom) disulfide bonds in PyRosetta, use the commands: form_disulfide and change_cys_state (see the comments at the end of guess_disulfides). """ ################################################################################ # COMMANDLINE COMPATIBILITY # everything below is added to provide commandline usage, # the available options are specified below # this method: # 1. defines the available options # 2. loads in the commandline or default values # 3. calls sample_folding with these values # parser object for managing input options # all defaults are for the example using "test_in.pdb" with reduced # cycles/jobs to provide results quickly parser = optparse.OptionParser() parser.add_option( '--pdb_filename', dest = 'pdb_filename', default = '../test/data/test_in.pdb', # default example PDB help = 'the PDB file containing the protein to fold') parser.add_option('--fasta_filename', dest = 'fasta_filename', default = '', # default empty! help = 'the FASTA file containing the protein to sequence to fold') parser.add_option( '--sequence', dest = 'sequence', default = '', # default empty! help = 'the protein sequence to fold') # the fragment files options parser.add_option('--long_frag_filename', dest = 'long_frag_filename', default = '../test/data/test9_fragments', # specific to each PDB (test_in.pdb here) help = 'the file of long fragments corresponding to the sequence') parser.add_option('--long_frag_length', dest = 'long_frag_length', default = '9', # must match the long_frag_filename help = 'the length of fragments contained in the long_frag_file') parser.add_option('--short_frag_filename', dest = 'short_frag_filename', default = '../test/data/test3_fragments', # specific to each PDB (test_in.pdb here) help = 'the file of short fragments corresponding to the sequence') parser.add_option( '--short_frag_length', dest = 'short_frag_length', default = '3', # must match the short_frag_filename help = 'the length of fragments contained in the short_frag_file') # folding protocol options parser.add_option('--kT', dest='kT', default = '3.0', # default higher for easy demonstration help = 'the \"temperature\" of the sample folding protocol') parser.add_option('--long_inserts', dest='long_inserts', default = '1', help = 'the number of times a long fragment insertion is applied per\ cycle in the sample folding protocol') parser.add_option('--short_inserts', dest='short_inserts', default = '3', help = 'the number of times a short fragment insertion is applied per\ cycle in the sample folding protocol') parser.add_option('--cycles', dest='cycles', default = '40' , help = 'the number of folding rounds (long and short fragment\ insertions) in the sample folding protocol') # PyJobDistributor options parser.add_option('--jobs', dest='jobs', default = '1', # default to single trajectory for speed help = 'the number of jobs (trajectories) to perform' ) parser.add_option('--job_output', dest = 'job_output', default = 'fold_output', # if a specific output name is desired help = 'the name preceding all output, output PDB files and .fasc') (options,args) = parser.parse_args() # the user may input a PDB file, fasta file, or sequence directly # PDB file option pose = Pose() # Fasta file option fasta_filename = options.fasta_filename if fasta_filename: # defaults to off, empty string f = open(fasta_filename, 'r') # open the file sequence = f.readlines() # read the text f.close() # close it # removing the trailing "\n" and any header lines sequence = [line.strip() for line in sequence if not '>' in line] sequence = ''.join( sequence ) # combine into a single sequence elif options.sequence: sequence = options.sequence else: pdb_filename = options.pdb_filename; #Default is the test PDB, not an empty string. pose_from_file(pose, pdb_filename) sequence = pose.sequence() #Checks for the sequence in a fasta, then direct, and finally from a PDB file. If no PDB file is given, it will load the default. # fragment files options long_frag_filename = options.long_frag_filename long_frag_length = int(options.long_frag_length) short_frag_filename = options.short_frag_filename short_frag_length = int(options.short_frag_length) # folding protocol options kT = float(options.kT) long_inserts = int(options.long_inserts) short_inserts = int(options.short_inserts) cycles = int(options.cycles) # PyJobDistributor options jobs = int(options.jobs) job_output = options.job_output # ''' Disabled until Abinitio error is fixed sample_folding(sequence, long_frag_filename, long_frag_length, short_frag_filename, short_frag_length, kT, long_inserts, short_inserts, cycles, jobs, job_output) ################################################################################ # ALTERNATE SCENARIOS ##################### # Obtaining Sequences """ PDB files are essential to all Rosetta applications. Occasionally protein sequences are required instead. In many cases, only proteins with solved crystal structures are used and as such, sequences, as with PDB files, are obtained from the RCSB website. To obtain the protein sequence from a PDB file: 1) locate your protein of interest at http://www.pdb.prg/ 2) download the PDB file, using a browser this includes: a. clicking "Download Files" on the upper right b. clicking "FASTA Sequence", the second option """ ####################################### # FASTA Files, PDB Files, and Biopython """ The FASTA file can be input to this method using the --fasta_filename option. Most FASTA files, including those obtained from RCSB, will start with a header line staring with the ">" character. FASTA files from RCSB may contain multiplt chains. Rosetta is designed to predict small globular domain folds and will not work effectively with multiple chains. Using PyRosetta, a PDB file protein sequence can be extracted from a Pose object using the method Pose.sequence. Biopython is an effective tool for extracting information from biological data files, including FASTA files and PDB files. Since it is in Python, it is a very useful tool for combining PyRosetta structural analysis and protocols with sequence level analysis or quick calculations. """ ################ # A Real Example """ All of the default variables and parameters used above are specific to the example with "test_in.pdb", which is supposed to be simple, straightforward, and speedy. Here is a more practical example: Pheromone ER-23 is a small protein (51 amino acids) with 5 disulfide bridges. The formation of these bonds may significantly effect folding. Suppose you are interested in the significance of these disulfide bonds as folding constraints and want to predict the protein without them using PyRosetta. 1. Obtain the protein sequence of RCSB PDB 1HA8 (instructions above) (for the example here, download the PDB file and ensure it is clean) 2. Make two fragment files using the "Generate Fragment Files" (instructions above) -of 9-mers (the long fragments) -of 3-mers (the short fragments) 3. Make a directory containing: -the PDB file for 1HA8 (cleaned of HETATMs and waters) lets name it "1HA8.clean.pdb" here -the 9-mer fragment file for 1HA8 lets name it "1HA8.frag9" here -the 3-mer fragment file for 1HA8 lets name it "1HA8.frag3" here -this sample script (technically not required, but otherwise the commands in 4. would change since folding.py would not be here) 4. Run the script from the commandline with appropriate arguments: >python folding.py --pdb_filename 1HA8.clean.pdb --long_frag_filename 1HA8.frag9 --long_frag_length 9 --short_frag_filename 1HA8.frag3 --short_frag_length 3 --jobs 100 --job_output 1HA8_folding_output --kT 1.0 --long_inserts 1 --short_inserts 3 --cycles 200 --disulfides 1 -The long_frag_length MUST match long_frag_filename (9 in this example) -The short_frag_length MUST match short_frag_filename (3 here) -100 trajectories is low, sampling protein conformations requires many trials, typically hundreds (800-1000) trajectories are attempted -The "temperature" parameter, kT, is set to 1.0, a neutral value, larger kT increase the diversity of sampling (easily escape local minima) but requires compensation with more trajectories smaller kT decreases the chance of sampling "useless" space but cannot easily escape local minima (the default kT = 3.0 -There are no common values for long_inserts, short_inserts, or cycles more inserts indicates greater sampling with fragments while greater cycles produces more sampling in general (though in this case only fragment insertion is performed) however, only one selection (Metropolis Criteria) is applied per cycle, thus a proposed structure with too many fragments inserted is likely to be rejected but additional trials increases the computation time noticeably here, as is typical is Rosetta, a fairly conservative change is proposed (1 9-mer and 3 3-mers) per cycle with many cycles 5. Wait for output, this will take a while (performing 100 trajectories of loop modeling involving 1*3*200 total moves per trajectory) 6. Analyze the results (see INTERPRETING RESULTS above) Note: this is NOT intended to be used for realistic centroid folding, it merely provides a "skeleton" for the code in PyRosetta. It may be useful for preliminary investigation but the best protocols are somewhat protein-specific, there is no current universal folding method. As mentioned above, this script only touches the low-resolution aspect of Rosetta folding and must be accompanied by a high-resolution step to yield complete results. Generally, a protocol similar to the one presented here with more drastic sampling and a larger number of trials should be sufficient to find realistic folds (after refinement). """ ############################## # Changing Folding Sampling """ Rosetta is a structure prediction tool and all predictions rely on the techniques (moves) used to generate proposed structures. There is NO GENERAL SOLUTION for predicting a protein fold from scratch and fragment insertion is an effective way of sampling the relevant space. Since fragments are sequence and secondary structure (prediction) dependent, the resultant folds are biochemically feasible and in accordance with our prior observations about protein stucture (the PDB). For speed, Rosetta folding (and the folding algorithm presented here) is performed in low-resolution (centroid) mode. This method of folding is abstractly looking for structures which obey certain size constraints (i.e. the rg score term) but do not unrealistically clash (i.e. the vdw score term) that are achieved easily (i.e. fragment insertion, a fast algorithm). Sufficient sampling and downstream refinement will convert these general folds (the output of this script) into reasonable estimates of the native protein structure. Many other sampling techniques and methods are available. PyRosetta was designed to make novel investigation of these ideas easy. Recommend literature include topics in protein structure prediction, Monte Carlo Markov Chain processes, Rosetta protocols, and simulated annealing algorithms. When developing custom protocols, you MUST understand the Movers available in PyRosetta (see the other sample scripts, specifically refinement.py). Unfortunately, you cannot build novel Mover classes in PyRosetta. Fortunately, it is easy to implement similar\ code on your own once you understand how moves and selection are performed. Please try alternate sampling methods to better understand how these algorithms perform and to find what moves best suite your problem. """ ############################# # Changing Folding Scoring """ The Rosetta "force-field" is a biasing function (see MCMC algorithms) used to predict protein structures based on what is observed about other protein structures. These scoring methods supply the constraints necessary to bias structure prediction but incorporate a LARGE number of fine tuned parameters. There is no easy way to optimize all relevant constants and performance has been increased by balancing speed and accuracy. Many other relevant metrics exist for protein structures and discriminating decoys (incorrect predictions) from actual proteins is an ongoing task. Please experiment with different ScoreTypes in PyRosetta, their weights, and the combination of terms. Unfortunately, you cannot build novel ScoreTypes (scoring methods) in PyRosetta. Fortunately, it is easy to implement similar code on your own once you understand how scoring biases MCMC simulations. Please try alternate scoring functions or unique selection methods to better understand which scoring terms contribute to performance and to find what scoring best suites your problem. """