miRNA分析

综述文章
标准数据
流程样例 & 实用工具+FASTX-Toolkit (v0.014)
- 程序

综述文章

microRNA相关的数据库与预测、功能分析软件
https://www.plob.org/article/1157.html

miRNA测序概述及实验分析流程
http://www.southgene.com/newsshow.php?cid=55&id=68

示范小RNA 项目结题报告
http://www.biotrainee.com/jmzeng/html_report/d/e/e/p/i/n/SmallRNA_result/index.html

标准数据

流程样例 & 实用工具+FASTX-Toolkit (v0.014)

FastQC (v0.11.8)
mulitiQC(v1.6)
miRDeep2(v2.0.0) (Anders和Huber，2010)
DESeq(v1.34.0) (Friedlander等，2012)
R(v3.5.1)

#合并
 
adapter="TCGATTCGTATGCCGTCTTCTGCTTG" 
inputfile="exp.fa"
index="/home/wj/data/gsm/hg"
outmap=$inputfile".mapper.fa"
outarf=$inputfile"_vs_genome.arf"
pre="hairpin.human.fasta"
mature="mature.human.fasta"
str="miRNA.str"
tail=$inputfile
hgfile="Homo_sapiens.GRCh38.fa"
type="hsa"
mapper.pl $inputfile -c -v -i -j -m -k $adapter -l 18 -p $index -n -s $outmap -t $outarf -o 16
quantifier.pl -p $pre -m $mature -r $outmap -t hsa -y $tail 
miRDeep2.pl $outmap  $hgfile $outarf $mature $other $pre -t $type

# 1. index 

bowtie-build hg.fa hg

# 2. mapper

mapper.pl 输入文件-e-h -i-j -m -k adapter -l 18 -p参考基因组的index -s处理过的reads输出的文件名-t输出文件名.arf

# 3. count

quantifier.pl -p前体序列参考文件.fa -m成熟序列参考文件.fa -r上一步处理过的reads输出文件-s从miRBase上下载的star序列文件-t物种名称-y文件名后缀

# 4. 鉴定测序得到的未知miRNA

miRDeep2.pl处理过的reads输出文件基因组文件处理过的输出文件名.arf成熟miRNA文件其他物种的成熟miRNA文件研究物种miRNA前体的文件-t物种名称2>report.log

# 5.  miRNA表达矩阵以及差异表达分析

程序

<代码>

#  "shell"## step1 : download raw data
echo {14..19} |sed 's/ /
/g' |while read id;  
do fastq-dump SRR15427$id ;
done
## step2 :  change sra data to fastq files## step3 : download the results from paper
mkdir paper_results && cd paper_results
wget ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE60nnn/GSE60292/suppl/GSE60292_RAW.tar
## tar xvf GSE60292_RAW.tar
ls *gz |while read id ; do (echo $id;zcat $id | cut -f 2 |perl -alne '{$t+=$_;}END{print $t}');done
ls *gz |xargs gunzip
## step4 : quality assessment
ls *fastq |while read id
do
echo $id
fastq_quality_filter -v -q 20 -p 80 -Q33  -i $id -o tmp ;
fastx_trimmer -v -f 1 -l 27 -i tmp  -Q33 -z -o ${id%%.*}_clean.fq.gz ;
done
rm tmp
## discarded 12%~~49%%
ls *_clean.fq.gz | while read id ; 
do  fastqc $id;
done
mkdir QC_results
mv *zip *html QC_results 
## step5 : alignment to miRBase v21 (hairpin.human.fa/mature.human.fa )
pathdata="/home/wj/data"
#### step5.1 using bowtie2 to do alignment
mkdir  bowtie2_index &&  cd bowtie2_index
bowtie2-build $pathdata/hairpin.human.fasta hairpin_human
bowtie2-build $pathdata/mature.human.fasta  mature_human
ls *_clean.fq.gz | while read id ; do  bowtie2 -x ./bowtie2_index/hairpin_human -U $id   -S ${id%%.*}.hairpin.sam ; done
ls *_clean.fq.gz | while read id ; do  bowtie2 -x ./bowtie2_index/mature_human  -U $id   -S ${id%%.*}.mature.sam ; done
## overall alignment rate:  10.20% / 5.71%/ 10.18%/ 4.36% / 10.02% / 4.95%  (before convert U to T )## overall alignment rate:  51.77% / 70.38%/51.45% /61.14%/ 52.20% / 65.85% (after convert U to T )## overall alignment rate:  6.67% / 3.78% / 6.70% / 2.80%/ 6.55% / 3.23%    (before convert U to T )## overall alignment rate:  34.94% / 46.16%/ 35.00%/ 38.50% / 35.46% /42.41%(after convert U to T )#### step5.2 using SHRiMP to do alignment##    http://compbio.cs.toronto.edu/shrimp/README##    3.5 Mapping cDNA reads against a miRNA database 
##　　We project the database with:
pathdata="/home/wj/data"
$SHRIMP_FOLDER/utils/project-db.py --shrimp-mode ls $pathdata/hairpin.human.fasta  $SHRIMP_FOLDER/bin/gmapper-ls -L  hairpin.human-ls SRR1542716.fastq  --qv-offset 33 -o 1 -H -E -a -1 -q -30 -g -30 --qv-offset 33 --strata -N 8  >map.out 2>map.log
## step6: counts the reads which mapping to each miRNA reference.## we need to exclude unmapped as well as multiple-mapped  reads## XS:i:<n> Alignment score for second-best alignment. Can be negative. Can be greater than 0 in --local mode## NM:i:1   ## NM i Edit distance to the reference, including ambiguous bases but excluding clipping#The following command exclude unmapped (-F 4) as well as multiple-mapped (grep -v “XS:”) reads#samtools view -F 4 input.bam | grep -v "XS:" | wc -l## 180466//1520320##cat >count.hairpin.sh
ls *hairpin.sam  | while read id
do
samtools view  -SF 4 $id |perl -alne '{$h{$F[2]}++}END{print "$_	$h{$_}" foreach sort keys %h }'  > ${id%%_*}.hairpin.counts
done

## bash count.hairpin.sh##cat >count.mature.sh
ls *mature.sam  | while read id
do
samtools view  -SF 4 $id |perl -alne '{$h{$F[2]}++}END{print "$_	$h{$_}" foreach sort keys %h }'  > ${id%%_*}.mature.counts
done
## bash count.mature.sh

# R part 

## step8: differential expression analysis by R package for miRNA expression patterns:## 文章里面提到的结果是：## MicroRNA sequencing revealed over 250 known and 34 predicted novel miRNAs to be differentially expressed between ET-1 stimulated and unstimulated control hiPSC-CMs.## (FDR < 0.1 and 1.5 fold change)
rm(list=ls())
setwd('J:\miRNA_test\paper_results')  ##把从GEO里面下载的文献结果放在这里
sampleIDs=c()
groupList=c()
allFiles=list.files(pattern = '.txt')
i=allFiles[1]
sampleID=strsplit(i,"_")[[1]][1]
treat=strsplit(i,"_")[[1]][4]
dat=read.table(i,stringsAsFactors = F)
colnames(dat)=c('miRNA',sampleID)
groupList=c(groupList,treat)
for (i in allFiles[-1]){
sampleID=strsplit(i,"_")[[1]][1]
treat=strsplit(i,"_")[[1]][4]
a=read.table(i,stringsAsFactors = F)
colnames(a)=c('miRNA',sampleID)
dat=merge(dat,a,by='miRNA')
groupList=c(groupList,treat)
}
### 上面的代码只是为了把6个独立的表达文件给合并成一个表达矩阵## we need to filter the low expression level miRNA
exprSet=dat[,-1]
rownames(exprSet)=dat[,1]
suppressMessages(library(DESeq2))
exprSet=ceiling(exprSet)
(colData <- data.frame(row.names=colnames(exprSet), groupList=groupList))
## DESeq2就是这么简单的用
dds <- DESeqDataSetFromMatrix(countData = exprSet,
colData = colData,
design = ~ groupList)
dds <- DESeq(dds)
png("qc_dispersions.png", 1000, 1000, pointsize=20)
plotDispEsts(dds, main="Dispersion plot")
dev.off()
res <- results(dds)
## 画一些图，相当于做QC吧
png("RAWvsNORM.png")
rld <- rlogTransformation(dds)
exprSet_new=assay(rld)
par(cex = 0.7)
n.sample=ncol(exprSet)
if(n.sample>40) par(cex = 0.5)
cols <- rainbow(n.sample*1.2)
par(mfrow=c(2,2))
boxplot(exprSet,  col = cols,main="expression value",las=2)
boxplot(exprSet_new, col = cols,main="expression value",las=2)
hist(exprSet[,1])
hist(exprSet_new[,1])
dev.off()library(RColorBrewer)
(mycols <- brewer.pal(8, "Dark2")[1:length(unique(groupList))])
# Sample distance heatmap
sampleDists <- as.matrix(dist(t(exprSet_new)))
#install.packages("gplots",repos = "http://cran.us.r-project.org")library(gplots)
png("qc-heatmap-samples.png", w=1000, h=1000, pointsize=20)
heatmap.2(as.matrix(sampleDists), key=F, trace="none",
col=colorpanel(100, "black", "white"),
ColSideColors=mycols[groupList], RowSideColors=mycols[groupList],
margin=c(10, 10), main="Sample Distance Matrix")
dev.off()png("MA.png")
DESeq2::plotMA(res, main="DESeq2", ylim=c(-2,2))
dev.off()
## 重点就是这里啦，得到了差异分析的结果
resOrdered <- res[order(res$padj),]
resOrdered=as.data.frame(resOrdered)
write.csv(resOrdered,"deseq2.results.csv",quote = F)##下面也是一些图，主要是看看样本之间的差异情况library(limma)
plotMDS(log(counts(dds, normalized=TRUE) + 1))
plotMDS(log(counts(dds, normalized=TRUE) + 1) - log(t( t(assays(dds)[["mu"]]) / sizeFactors(dds) ) + 1))
plotMDS( assays(dds)[["counts"]] )  ## raw count
plotMDS( assays(dds)[["mu"]] ) ##- fitted values.

<说明>

exprSet