-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpacbio-subreads-processing.wdl
More file actions
295 lines (266 loc) · 13.3 KB
/
pacbio-subreads-processing.wdl
File metadata and controls
295 lines (266 loc) · 13.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
version 1.0
# Copyright (c) 2020 Leiden University Medical Center
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import "tasks/ccs.wdl" as ccs
import "tasks/fastqc.wdl" as fastqc
import "tasks/isoseq3.wdl" as isoseq3
import "tasks/lima.wdl" as lima
import "tasks/multiqc.wdl" as multiqc
import "tasks/pacbio.wdl" as pacbio
import "tasks/pbbam.wdl" as pbbam
import "tasks/samtools.wdl" as samtools
# Process PacBio subreads: chunked CCS consensus calling, demultiplexing with
# lima, optional isoseq3 refine, FastQC per demultiplexed sample, optional
# fastq extraction, and a MultiQC summary report.
workflow SubreadsProcessing {
    input {
        File subreadsFile
        File barcodesFasta
        String libraryDesign = "same"
        String outputDirectory = "."
        Int ccsChunks = 2
        Boolean ccsMode = true
        Boolean splitBamNamed = true
        Boolean runIsoseq3Refine = false
        Boolean generateFastq = false
        File? subreadsIndexFile
        Int limaThreads = 2
        Int ccsThreads = 2
        Int fastqcThreads = 4
        Map[String, String] dockerImages = {
            "biowdl-input-converter": "quay.io/biocontainers/biowdl-input-converter:0.3.0--pyhdfd78af_0",
            "ccs": "quay.io/biocontainers/pbccs:6.0.0--h9ee0642_2",
            "fastqc": "quay.io/biocontainers/fastqc:0.11.9--hdfd78af_1",
            "isoseq3": "quay.io/biocontainers/isoseq3:3.4.0--0",
            "lima": "quay.io/biocontainers/lima:2.2.0--h9ee0642_0",
            "python3": "python:3.7-slim",
            "multiqc": "quay.io/biocontainers/multiqc:1.10.1--pyhdfd78af_1",
            "pbbam": "quay.io/biocontainers/pbbam:1.6.0--h058f120_1",
            "samtools": "quay.io/biocontainers/samtools:1.12--h9aed4be_1",
            "pacbio-merge": "quay.io/redmar_van_den_berg/pacbio-merge:0.2"
        }
    }

    meta {
        allowNestedInputs: true
    }

    # The name of the subreads, to be used to determine output filenames.
    # basename() returns a String; the previous `File` declaration wrongly
    # coerced this bare name into a file path.
    String subreadsName = basename(subreadsFile, ".subreads.bam")

    # Generate the chunk strings (e.g. "1/2", "2/2") used to parallelize ccs.
    call pacbio.ccsChunks as createChunks {
        input:
            chunkCount = ccsChunks,
            dockerImage = dockerImages["python3"]
    }

    # Index the input bamfile, unless an index was supplied by the caller.
    if (!defined(subreadsIndexFile)) {
        call pbbam.Index as pbIndex {
            input:
                bamFile = subreadsFile,
                dockerImage = dockerImages["pbbam"]
        }
    }

    # Exactly one of each pair is defined: pbIndex outputs when we indexed
    # ourselves, the workflow inputs otherwise.
    File subreadsIndex = select_first([pbIndex.index, subreadsIndexFile])
    File subreadsBam = select_first([pbIndex.indexedBam, subreadsFile])

    # Run ccs once per chunk so consensus calling runs in parallel.
    scatter (chunk in createChunks.chunks) {
        # Convert chunk from 1/10 to 1 to determine output filename.
        String chunkNumber = sub(chunk, "/.*$", "")

        call ccs.CCS as ccs {
            input:
                subreadsFile = subreadsBam,
                subreadsIndexFile = subreadsIndex,
                outputPrefix = outputDirectory + "/ccs/" + subreadsName + ".chunk." + chunkNumber,
                threads = ccsThreads,
                chunkString = chunk,
                dockerImage = dockerImages["ccs"]
        }
    }

    # Merge the per-chunk bam files again.
    call samtools.Merge as mergeBams {
        input:
            bamFiles = ccs.ccsBam,
            outputBamPath = outputDirectory + "/ccs/" + subreadsName + ".merged.ccs.bam",
            dockerImage = dockerImages["samtools"]
    }

    # Merge the per-chunk ccs json reports for MultiQC.
    call pacbio.mergePacBio as mergeCCSReport {
        input:
            reports = ccs.ccsJsonReport,
            outputPathMergedReport = outputDirectory + "/ccs/" + subreadsName + ".merged.ccs.report.json",
            dockerImage = dockerImages["pacbio-merge"]
    }

    # Demultiplex the merged consensus reads by barcode.
    call lima.Lima as lima {
        input:
            libraryDesign = libraryDesign,
            ccsMode = ccsMode,
            splitBamNamed = splitBamNamed,
            inputBamFile = mergeBams.outputBam,
            barcodeFile = barcodesFasta,
            outputPrefix = outputDirectory + "/lima/" + subreadsName,
            threads = limaThreads,
            dockerImage = dockerImages["lima"]
    }

    # Per demultiplexed sample: optionally refine, run FastQC, and
    # optionally extract fastq files.
    scatter (pair in zip(lima.limaBam, lima.limaBamIndex)) {
        File bamFile = pair.left
        File bamFileIndex = pair.right

        if (runIsoseq3Refine) {
            # Refine renames full-length ("fl") reads to full-length
            # non-chimeric ("flnc").
            String refineOutputPrefix = sub(basename(bamFile, ".bam"), "fl", "flnc")

            call isoseq3.Refine as refine {
                input:
                    inputBamFile = bamFile,
                    inputBamIndex = bamFileIndex,
                    primerFile = barcodesFasta,
                    outputDir = outputDirectory + "/refine",
                    outputNamePrefix = refineOutputPrefix,
                    dockerImage = dockerImages["isoseq3"]
            }

            call fastqc.Fastqc as fastqcRefine {
                input:
                    seqFile = refine.refineBam,
                    outdirPath = outputDirectory + "/fastqc/" + basename(refine.refineBam, ".bam") + "-fastqc",
                    format = "bam",
                    threads = fastqcThreads,
                    dockerImage = dockerImages["fastqc"]
            }

            if (generateFastq) {
                call samtools.Fastq as bam2FastqRefine {
                    input:
                        inputBam = refine.refineBam,
                        outputRead1 = outputDirectory + "/fastq-files/" + basename(refine.refineBam, ".bam") + "_temp.fastq.gz",
                        outputRead0 = outputDirectory + "/fastq-files/" + basename(refine.refineBam, ".bam") + ".fastq.gz",
                        outputQuality = true,
                        compressionLevel = 1,
                        threads = 3,
                        dockerImage = dockerImages["samtools"]
                }
            }
        }

        if (!runIsoseq3Refine) {
            call fastqc.Fastqc as fastqcLima {
                input:
                    seqFile = bamFile,
                    outdirPath = outputDirectory + "/fastqc/" + basename(bamFile, ".bam") + "-fastqc",
                    format = "bam",
                    threads = fastqcThreads,
                    dockerImage = dockerImages["fastqc"]
            }

            if (generateFastq) {
                call samtools.Fastq as bam2FastqLima {
                    input:
                        inputBam = bamFile,
                        outputRead1 = outputDirectory + "/fastq-files/" + basename(bamFile, ".bam") + "_temp.fastq.gz",
                        outputRead0 = outputDirectory + "/fastq-files/" + basename(bamFile, ".bam") + ".fastq.gz",
                        outputQuality = true,
                        compressionLevel = 1,
                        threads = 3,
                        dockerImage = dockerImages["samtools"]
                }
            }
        }

        # Exactly one FastQC branch ran, depending on runIsoseq3Refine.
        File fastqcHtmlReport = select_first([fastqcRefine.htmlReport, fastqcLima.htmlReport])
        File fastqcZipReport = select_first([fastqcRefine.reportZip, fastqcLima.reportZip])

        # Determine the sample name from the bam file name. This is needed
        # because the sample names are determine from the headers in the
        # fasta file, which is not accessible from the WDL.
        # The extension pattern is escaped and anchored so only a literal
        # trailing ".bam" is removed (sub() takes a regex, where an
        # unescaped "." matches any character).
        String sampleName = sub(sub(bamFile, ".*--", ""), "\\.bam$", "")

        if (generateFastq) {
            # Exactly one fastq branch ran, matching the FastQC branch above.
            File? fastqFile = select_first([bam2FastqRefine.read0, bam2FastqLima.read0])
            File? extraFastqFile = select_first([bam2FastqRefine.read1, bam2FastqLima.read1])
        }
    }

    # Gather all FastQC reports (html + zip) for MultiQC.
    Array[File] qualityReports = flatten([fastqcHtmlReport, fastqcZipReport])

    call multiqc.MultiQC as multiqcTask {
        input:
            reports = qualityReports,
            outDir = outputDirectory + "/multiqc",
            dataDir = true,
            dockerImage = dockerImages["multiqc"]
    }

    output {
        File ccsReadsMerge = mergeBams.outputBam
        File ccsIndexMerge = mergeBams.outputBamIndex
        File ccsReportMerge = mergeCCSReport.outputMergedReport
        Array[File] ccsReport = ccs.ccsReport
        Array[File] ccsJsonReport = ccs.ccsJsonReport
        Array[File] ccsStderr = ccs.ccsStderr
        Array[File] zmwMetrics = ccs.zmwMetrics
        Array[File] limaReads = lima.limaBam
        Array[File] limaIndex = lima.limaBamIndex
        Array[File] limaSubreadset = lima.limaXml
        File limaStderr = lima.limaStderr
        File limaJson = lima.limaJson
        File limaCounts = lima.limaCounts
        File limaReport = lima.limaReport
        File limaSummary = lima.limaSummary
        Array[String] samples = sampleName
        Array[File] workflowReports = qualityReports
        File multiqcReport = multiqcTask.multiqcReport
        File? multiqcZip = multiqcTask.multiqcDataDirZip
        Array[File?] fastqFiles = fastqFile
        Array[File?] extraFastqFiles = extraFastqFile
        Array[File?] refineReads = refine.refineBam
        Array[File?] refineIndex = refine.refineBamIndex
        Array[File?] refineConsensusReadset = refine.refineConsensusReadset
        Array[File?] refineSummary = refine.refineFilterSummary
        Array[File?] refineReport = refine.refineReport
        Array[File?] refineStderr = refine.refineStderr
    }

    parameter_meta {
        # inputs
        subreadsFile: {description: "The PacBio subreads file that contains the raw PacBio reads.", category: "common"}
        barcodesFasta: {description: "Fasta file with the barcodes used in the PacBio experiment.", category: "common"}
        libraryDesign: {description: "Barcode structure of the library design.", category: "advanced"}
        ccsMode: {description: "Ccs mode, use optimal alignment options.", category: "advanced"}
        splitBamNamed: {description: "Split bam file(s) by resolved barcode pair name.", category: "advanced"}
        runIsoseq3Refine: {description: "Run isoseq3 refine for de-novo transcript reconstruction. Do not set this to true when analysing dna reads.", category: "advanced"}
        ccsChunks: {description: "The number of chunks to be used by ccs.", category: "advanced"}
        generateFastq: {description: "Generate fastq files from demultiplexed bam files.", category: "common"}
        outputDirectory: {description: "The directory in which the output files will be put.", category: "common"}
        subreadsIndexFile: {description: ".pbi file for the subreadsFile. If not specified, the subreadsFile will be indexed automatically." , category: "advanced"}
        limaThreads: {description: "The number of CPU threads to be used by lima.", category: "advanced"}
        ccsThreads: {description: "The number of CPU threads to be used by ccs.", category: "advanced"}
        fastqcThreads: {description: "The number of CPU threads to be used by fastQC.", category: "advanced"}
        dockerImages: {description: "The docker image(s) used for this workflow. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}

        # outputs
        ccsReadsMerge: {description: "Merged consensus reads file(s)."}
        ccsIndexMerge: {description: "Merged index of consensus reads file(s)."}
        ccsReportMerge: {description: "Merged ccs report file."}
        ccsReport: {description: "Ccs report file(s)."}
        ccsJsonReport: {description: "Ccs results json report file(s)."}
        ccsStderr: {description: "Ccs stderr log file(s)."}
        zmwMetrics: {description: "ZMW metrics json file(s)."}
        limaReads: {description: "Demultiplexed reads file(s)."}
        limaIndex: {description: "Index of demultiplexed reads file(s)."}
        limaSubreadset: {description: "Xml file of the subreadset(s)."}
        limaStderr: {description: "Lima stderr log file(s)."}
        limaJson: {description: "Lima json file(s)."}
        limaCounts: {description: "Lima counts file(s)."}
        limaReport: {description: "Lima report file(s)."}
        limaSummary: {description: "Lima summary file(s)."}
        samples: {description: "The name(s) of the sample(s)."}
        workflowReports: {description: "A collection of all metrics."}
        fastqFiles: {description: "Fastq files extracted from bam files."}
        extraFastqFiles: {description: "Extra fastq files extracted from bam files."}
        multiqcReport: {description: "The multiqc html report."}
        multiqcZip: {description: "The multiqc data zip file."}
        refineReads: {description: "Filtered reads file(s)."}
        refineIndex: {description: "Index of filtered reads file(s)."}
        refineConsensusReadset: {description: "Refine consensus readset xml file(s)."}
        refineSummary: {description: "Refine summary file(s)."}
        refineReport: {description: "Refine report file(s)."}
        refineStderr: {description: "Refine stderr log file(s)."}
    }
}