{"payload":{"feedbackUrl":"https://github.com/orgs/community/discussions/53140","repo":{"id":55501615,"defaultBranch":"master","name":"rfam-production","ownerLogin":"Rfam","currentUserCanPush":false,"isFork":false,"isEmpty":false,"createdAt":"2016-04-05T11:16:05.000Z","ownerAvatar":"https://avatars.githubusercontent.com/u/15381791?v=4","public":true,"private":false,"isOrgOwned":true},"refInfo":{"name":"","listCacheKey":"v0:1726689558.0","currentOid":""},"activityList":{"items":[{"before":null,"after":"c47db591ecad858259977582c089397dce0f24dc","ref":"refs/heads/rework-text-search","pushedAt":"2024-09-18T19:59:18.000Z","pushType":"branch_creation","commitsCount":0,"pusher":{"login":"blakesweeney","name":"Blake Sweeney","path":"/blakesweeney","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/330875?s=80&v=4"},"commit":{"message":"First work on updating text search\n\nThe full region export takes way too much memory and way too long\n(<100GB of memory and hours of time). This is work to fix that. The\nqueries can be improved and the pipeline speed up. This starts with\nreducing the amount of data fetched, parallelizing the queries and using\ncaching for values that should just be cached.","shortMessageHtmlLink":"First work on updating text search"}},{"before":"14afc1d062eb0070ad2cb67ec5589fee6361a734","after":"617dd771fb1bed061f7ec0ba2919a98ec34d5687","ref":"refs/heads/master","pushedAt":"2024-09-02T08:00:47.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"blakesweeney","name":"Blake Sweeney","path":"/blakesweeney","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/330875?s=80&v=4"},"commit":{"message":"Split '_' in names\n\nSeems that in the text search the \"_\" is treated as a word character.\nThis means we don't get the sort of partial matches we would like. 
This\nshould fix that, but may require some tweaking.","shortMessageHtmlLink":"Split '_' in names"}},{"before":"8098752bff11d20a3479a71b1b794576cd975b1d","after":null,"ref":"refs/heads/genome-pipeline","pushedAt":"2024-08-30T15:10:40.000Z","pushType":"branch_deletion","commitsCount":0,"pusher":{"login":"blakesweeney","name":"Blake Sweeney","path":"/blakesweeney","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/330875?s=80&v=4"}},{"before":"6279d92db69d2547b5e1f9d0aad7d81de00706db","after":"14afc1d062eb0070ad2cb67ec5589fee6361a734","ref":"refs/heads/master","pushedAt":"2024-08-30T15:10:13.000Z","pushType":"pr_merge","commitsCount":258,"pusher":{"login":"blakesweeney","name":"Blake Sweeney","path":"/blakesweeney","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/330875?s=80&v=4"},"commit":{"message":"Merge pull request #175 from Rfam/genome-pipeline\n\nGenome pipeline","shortMessageHtmlLink":"Merge pull request #175 from Rfam/genome-pipeline"}},{"before":"4fc863489173821c66c75b22f57694dc85aba3ef","after":"8098752bff11d20a3479a71b1b794576cd975b1d","ref":"refs/heads/genome-pipeline","pushedAt":"2024-08-30T15:08:21.000Z","pushType":"push","commitsCount":2,"pusher":{"login":"blakesweeney","name":"Blake Sweeney","path":"/blakesweeney","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/330875?s=80&v=4"},"commit":{"message":"Various fixes for taxonomy loading","shortMessageHtmlLink":"Various fixes for taxonomy loading"}},{"before":"9faf739e02b81d33bf52aa1f2c9c3ab0defb5310","after":"4fc863489173821c66c75b22f57694dc85aba3ef","ref":"refs/heads/genome-pipeline","pushedAt":"2024-06-24T12:12:26.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"blakesweeney","name":"Blake Sweeney","path":"/blakesweeney","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/330875?s=80&v=4"},"commit":{"message":"Fix Rfamseq configuration generation\n\nTemplate file was incorrect and this didn't actually use any 
defined\nvalues.","shortMessageHtmlLink":"Fix Rfamseq configuration generation"}},{"before":"1740c07a03b56c7f16bbf19b9594d293d43ee84d","after":"9faf739e02b81d33bf52aa1f2c9c3ab0defb5310","ref":"refs/heads/genome-pipeline","pushedAt":"2024-06-24T10:12:06.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"blakesweeney","name":"Blake Sweeney","path":"/blakesweeney","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/330875?s=80&v=4"},"commit":{"message":"Correct output filenames","shortMessageHtmlLink":"Correct output filenames"}},{"before":"d18cf18e29848c9b27b258a1e27282defa5cd889","after":"1740c07a03b56c7f16bbf19b9594d293d43ee84d","ref":"refs/heads/genome-pipeline","pushedAt":"2024-06-24T09:37:57.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"blakesweeney","name":"Blake Sweeney","path":"/blakesweeney","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/330875?s=80&v=4"},"commit":{"message":"Try to fix config building issue\n\nThis should fix the config building issue, it no longer uses the same\nprocess twice, which should be the cause. 
At least it no longer errors\nout when dry run.","shortMessageHtmlLink":"Try to fix config building issue"}},{"before":"1aa67c5bf17db2c839119506fc764ed9e4300907","after":"d18cf18e29848c9b27b258a1e27282defa5cd889","ref":"refs/heads/genome-pipeline","pushedAt":"2024-06-18T09:03:09.000Z","pushType":"push","commitsCount":3,"pusher":{"login":"blakesweeney","name":"Blake Sweeney","path":"/blakesweeney","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/330875?s=80&v=4"},"commit":{"message":"Minor naming tweaks","shortMessageHtmlLink":"Minor naming tweaks"}},{"before":"8e2d6d974f9b614630d0e477ec68820a7f04e5e0","after":"1aa67c5bf17db2c839119506fc764ed9e4300907","ref":"refs/heads/genome-pipeline","pushedAt":"2024-06-18T08:50:45.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"blakesweeney","name":"Blake Sweeney","path":"/blakesweeney","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/330875?s=80&v=4"},"commit":{"message":"Try to fix conflicting names\n\nI think I am accidentally overwriting the merged sequences with a chunk\nsince they have similar naming patterns. 
This moves the merged sequences\nto their own pattern, which hopefully prevents it from being\noverwritten.","shortMessageHtmlLink":"Try to fix conflicting names"}},{"before":"ed82aae3ffbc329413b35b4563acf79f18c80771","after":"a6bfdc6241e4d6bb6da314ab6252e05b45bbf82c","ref":"refs/heads/improving-indexing","pushedAt":"2024-06-17T12:11:22.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"blakesweeney","name":"Blake Sweeney","path":"/blakesweeney","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/330875?s=80&v=4"},"commit":{"message":"Reformat with black and index previous names\n\nThese are ignored but are sometimes useful so we should try to index\nthem as well.","shortMessageHtmlLink":"Reformat with black and index previous names"}},{"before":null,"after":"ed82aae3ffbc329413b35b4563acf79f18c80771","ref":"refs/heads/improving-indexing","pushedAt":"2024-06-17T10:13:54.000Z","pushType":"branch_creation","commitsCount":0,"pusher":{"login":"blakesweeney","name":"Blake Sweeney","path":"/blakesweeney","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/330875?s=80&v=4"},"commit":{"message":"Convert _ to ' ' for indexing\n\nI think this will allow better results but we will need to test out the\ndifferences once this is indexed.","shortMessageHtmlLink":"Convert _ to ' ' for indexing"}},{"before":"ea64a5a8f71fee62d2a65584f50dfc8cbaa7140c","after":"8e2d6d974f9b614630d0e477ec68820a7f04e5e0","ref":"refs/heads/genome-pipeline","pushedAt":"2024-03-11T15:58:18.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"blakesweeney","name":"Blake Sweeney","path":"/blakesweeney","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/330875?s=80&v=4"},"commit":{"message":"Add initial documentation for the pipeline\n\nThis covers basics of the pipeline, how to run it, what it does, and\nsome development and implementation notes.","shortMessageHtmlLink":"Add initial documentation for the 
pipeline"}},{"before":"81affa12de560b560592123c2ac847505e5ccb27","after":"ea64a5a8f71fee62d2a65584f50dfc8cbaa7140c","ref":"refs/heads/genome-pipeline","pushedAt":"2024-03-10T14:23:01.000Z","pushType":"push","commitsCount":2,"pusher":{"login":"blakesweeney","name":"Blake Sweeney","path":"/blakesweeney","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/330875?s=80&v=4"},"commit":{"message":"Start fixing metadata writing\n\nThis writes some temporary JSON data that can be used to build the final\nmetadata entries.","shortMessageHtmlLink":"Start fixing metadata writing"}},{"before":"5e4dcd64d849a9ea33302224e3890187d422f9af","after":"81affa12de560b560592123c2ac847505e5ccb27","ref":"refs/heads/genome-pipeline","pushedAt":"2024-03-08T16:30:02.000Z","pushType":"push","commitsCount":4,"pusher":{"login":"blakesweeney","name":"Blake Sweeney","path":"/blakesweeney","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/330875?s=80&v=4"},"commit":{"message":"Try to fix some issues in building database\n\nThe database needs some tweaks before it is properly built. Notably this\nneeds to follow a correct naming system, generate the correct values for\na config (the database size) and create the config file. This should do\nmore or less everything that is required to build the final data.","shortMessageHtmlLink":"Try to fix some issues in building database"}},{"before":"65a88d494f1e36dedc48eeaf406397ecbe4be10b","after":"5e4dcd64d849a9ea33302224e3890187d422f9af","ref":"refs/heads/genome-pipeline","pushedAt":"2024-03-06T15:21:56.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"blakesweeney","name":"Blake Sweeney","path":"/blakesweeney","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/330875?s=80&v=4"},"commit":{"message":"Improve sequence chunking\n\nThis should be a much faster way to chunk the sequences. 
The previous\nmethod used a perl script from elsewhere in Rfam, but I much prefer to\nkeep everything self-contained and in as few languages as possible.\n\nThe new method works by splitting the sequences in a seqstat file into\nchunks of roughly the given size. This is done by using the sequence\nlength in the file as an estimate of it size, that should be safe,\nprobably. These chunks are then used as input to esl-sfetch to pull out\nthe given sequences and build the target chunks.","shortMessageHtmlLink":"Improve sequence chunking"}},{"before":"c00b6c4e4842dd7784f198462080fd0c726f65eb","after":"65a88d494f1e36dedc48eeaf406397ecbe4be10b","ref":"refs/heads/genome-pipeline","pushedAt":"2024-03-05T16:12:27.000Z","pushType":"push","commitsCount":2,"pusher":{"login":"blakesweeney","name":"Blake Sweeney","path":"/blakesweeney","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/330875?s=80&v=4"},"commit":{"message":"Initial testing of MGnify filtering\n\nThis can filter out MAGs using simple criteria for adding MGnify data to\nRfamseq. 
This just checks that filtering works reasonably and probably\nisn't the final version.","shortMessageHtmlLink":"Initial testing of MGnify filtering"}},{"before":"0649e0016adbe804ac46baf71bf473fbc529fde5","after":"c00b6c4e4842dd7784f198462080fd0c726f65eb","ref":"refs/heads/genome-pipeline","pushedAt":"2024-03-04T19:16:03.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"blakesweeney","name":"Blake Sweeney","path":"/blakesweeney","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/330875?s=80&v=4"},"commit":{"message":"Correct filenames in shuffling","shortMessageHtmlLink":"Correct filenames in shuffling"}},{"before":"32db8e8d696c1044dc6dfacf6a06e0c7b3291e11","after":"0649e0016adbe804ac46baf71bf473fbc529fde5","ref":"refs/heads/genome-pipeline","pushedAt":"2024-03-04T19:09:07.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"blakesweeney","name":"Blake Sweeney","path":"/blakesweeney","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/330875?s=80&v=4"},"commit":{"message":"Use the correct converter","shortMessageHtmlLink":"Use the correct converter"}},{"before":"0d4d3c516ae1be5d3c9b7deecc1729550ac3ab39","after":"32db8e8d696c1044dc6dfacf6a06e0c7b3291e11","ref":"refs/heads/genome-pipeline","pushedAt":"2024-03-04T19:05:25.000Z","pushType":"push","commitsCount":2,"pusher":{"login":"blakesweeney","name":"Blake Sweeney","path":"/blakesweeney","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/330875?s=80&v=4"},"commit":{"message":"More logging","shortMessageHtmlLink":"More logging"}},{"before":"a5a3187b475cb6f0451d6e6722d8246bd423fea5","after":"0d4d3c516ae1be5d3c9b7deecc1729550ac3ab39","ref":"refs/heads/genome-pipeline","pushedAt":"2024-03-04T19:00:23.000Z","pushType":"push","commitsCount":2,"pusher":{"login":"blakesweeney","name":"Blake Sweeney","path":"/blakesweeney","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/330875?s=80&v=4"},"commit":{"message":"Make merging proteome strip easier to debug\n\nDistinct files 
is often easier to work with.","shortMessageHtmlLink":"Make merging proteome strip easier to debug"}},{"before":"d74dd4efb43a5a5e1c8a3a007565a21b45db3b94","after":"a5a3187b475cb6f0451d6e6722d8246bd423fea5","ref":"refs/heads/genome-pipeline","pushedAt":"2024-03-04T18:49:21.000Z","pushType":"push","commitsCount":4,"pusher":{"login":"blakesweeney","name":"Blake Sweeney","path":"/blakesweeney","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/330875?s=80&v=4"},"commit":{"message":"Add more viral genomes","shortMessageHtmlLink":"Add more viral genomes"}},{"before":"ea6852a86a2f143193267a7423b9738f3e818bd5","after":"d74dd4efb43a5a5e1c8a3a007565a21b45db3b94","ref":"refs/heads/genome-pipeline","pushedAt":"2024-03-04T12:49:06.000Z","pushType":"push","commitsCount":3,"pusher":{"login":"blakesweeney","name":"Blake Sweeney","path":"/blakesweeney","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/330875?s=80&v=4"},"commit":{"message":"Fixes and updates","shortMessageHtmlLink":"Fixes and updates"}},{"before":"56bc1b2484f525beed72f18fc35cc82ed61a2469","after":"ea6852a86a2f143193267a7423b9738f3e818bd5","ref":"refs/heads/genome-pipeline","pushedAt":"2024-03-04T11:06:02.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"blakesweeney","name":"Blake Sweeney","path":"/blakesweeney","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/330875?s=80&v=4"},"commit":{"message":"Incomplete rewrite\n\nThis rewrites much of the logic of the pipeline. This now works very\ndifferently.\n\nFirst, it uses the JSON api from Uniprot. This is much easier to work\nwith. It also use models the data from the API and does not create new\ncustom data structures, for better or worse. Because of this all uniprot\ntests are broken currently.\n\nSecondly, this adds wrappers to ENA and NCBI to figure out how to fetch\ndata for them. 
The wrappers do not actually fetch data, simply say where\nit may be found.\n\nNext this reworks the downloader to use cleaner but different logic.\nGiven something which is a GCA or GCF it will simply download the entire\ngenome and not check the components. This could be risky but should be\nfine. We can validate if the genome is as expected later.\n\nThe fallback logic is changed to be clearer and not use as many\nexception handlers, hopefully it should be easier to follow now.\n\nIt can do a better job handling outdated genomes, in that it will always\ntry to use the latest. This is generally safe, I hope.\n\nIt also can fetch custom proteomes, not just the reference ones. This is\nbecause we want to add additional viral genomes, as we have poor\ncoverage there. This adds some logic to the pipeline and the python code\nto fetch and deduplicate such entries. We use the data in PIR to select\nwhich additional proteomes to add.\n\nIt also does not fail if any one proteome download fails, instead this\nis written to a failure file, so they can be inspected later.\n\nThis is not yet incomplete in that it likely does not handle WGS sets,\nand does not yet produce metadata. However, it is far enough along that\nit is worth running on the entire dataset and seeing what happens. The\nfailure cases can then be inspected. Also, there is a considerable about\nof duplicated, unneeded and likely incorrect code that will have to be\ncleaned up once the core logic is figured out. 
This is only being\ncommitted to see what parts need to be kept.","shortMessageHtmlLink":"Incomplete rewrite"}},{"before":"a4f85229fb2c29b9243b394ef709ea7672e00eca","after":"56bc1b2484f525beed72f18fc35cc82ed61a2469","ref":"refs/heads/genome-pipeline","pushedAt":"2024-02-27T14:04:22.000Z","pushType":"push","commitsCount":6,"pusher":{"login":"blakesweeney","name":"Blake Sweeney","path":"/blakesweeney","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/330875?s=80&v=4"},"commit":{"message":"Try to improve usability of genome downloader\n\nThis adds some docs and general improvements. Things which should be\nprivate are and there are more docs about how it works. Still lots to\ndo.","shortMessageHtmlLink":"Try to improve usability of genome downloader"}},{"before":"b5c6925208a13b92a7b014428e442359ced26908","after":"a4f85229fb2c29b9243b394ef709ea7672e00eca","ref":"refs/heads/genome-pipeline","pushedAt":"2024-02-23T12:10:10.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"blakesweeney","name":"Blake Sweeney","path":"/blakesweeney","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/330875?s=80&v=4"},"commit":{"message":"Use all sequences for GCA/GCF\n\nAs the comment explains this is a major change to how we fetch\nsequences. However, this should increase the reliability quite a bit.\nWhich is the major concern of the current code.\n\nThe most concerning effects of this are.\n\n1. This could miss sequences. This can happen if the component is not a\n member of the genome. This is very odd and I haven't seen it. The\n closest I've seen is using things which are more or less assembled,\n which isn't ideal but hopefully this is close enough.\n2. This could use the wrong identifiers for a genome. As an example, the\n components could be requesting a WGS set. While those sequences are\n part of the genome, the ids used are not the WGS ids but instead\n different ones. 
This could be an issue for people mapping from Rfam\n results to UniProt results.\n\nSome other minor effects are.\n\n1. We will have too many sequences. Basically UniProt may exclude some\n partially assembled sequences we use. That isn't really a problem in\n my view.\n\nOverall I think this should be done because it should reduce the failure\nmodes a lot. Making this reliable is more important than ensure ids are\nexactly the same and missing sequences seems unlikely.","shortMessageHtmlLink":"Use all sequences for GCA/GCF"}},{"before":"afd0213c55266740267b3e0e7fe5fe87b45117a4","after":"b5c6925208a13b92a7b014428e442359ced26908","ref":"refs/heads/genome-pipeline","pushedAt":"2024-02-22T17:49:55.000Z","pushType":"push","commitsCount":2,"pusher":{"login":"blakesweeney","name":"Blake Sweeney","path":"/blakesweeney","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/330875?s=80&v=4"},"commit":{"message":"Normalize genome source case\n\nThis should make it more robust to trivial changes. 
It seems UniProt has\nchanged how they write out the genome sources.","shortMessageHtmlLink":"Normalize genome source case"}},{"before":"020fd28c3a561b512cc047bb39c657859530b49e","after":null,"ref":"refs/heads/microrna-slurm","pushedAt":"2024-02-21T16:20:11.000Z","pushType":"branch_deletion","commitsCount":0,"pusher":{"login":"blakesweeney","name":"Blake Sweeney","path":"/blakesweeney","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/330875?s=80&v=4"}},{"before":"8aa30ca5df15560960a4b4127ee3338286455bd2","after":"6279d92db69d2547b5e1f9d0aad7d81de00706db","ref":"refs/heads/master","pushedAt":"2024-02-21T16:20:08.000Z","pushType":"pr_merge","commitsCount":4,"pusher":{"login":"blakesweeney","name":"Blake Sweeney","path":"/blakesweeney","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/330875?s=80&v=4"},"commit":{"message":"Merge pull request #174 from Rfam/microrna-slurm\n\nMicrorna slurm","shortMessageHtmlLink":"Merge pull request #174 from Rfam/microrna-slurm"}},{"before":"c41cac1bd5550af79ea746d638e36770dffcf879","after":null,"ref":"refs/heads/rfam2rnacentral","pushedAt":"2024-02-21T16:17:49.000Z","pushType":"branch_deletion","commitsCount":0,"pusher":{"login":"blakesweeney","name":"Blake Sweeney","path":"/blakesweeney","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/330875?s=80&v=4"}}],"hasNextPage":true,"hasPreviousPage":false,"activityType":"all","actor":null,"timePeriod":"all","sort":"DESC","perPage":30,"cursor":"Y3Vyc29yOnYyOpK7MjAyNC0wOS0xOFQxOTo1OToxOC4wMDAwMDBazwAAAAS6HMi7","startCursor":"Y3Vyc29yOnYyOpK7MjAyNC0wOS0xOFQxOTo1OToxOC4wMDAwMDBazwAAAAS6HMi7","endCursor":"Y3Vyc29yOnYyOpK7MjAyNC0wMi0yMVQxNjoxNzo0OS4wMDAwMDBazwAAAAQBDB8t"}},"title":"Activity · Rfam/rfam-production"}