This function retrieves data from RegulonDB. Attributes from datasets can be selected and filtered.

get_dataset(
  regulondb,
  dataset = NULL,
  attributes = NULL,
  filters = NULL,
  and = TRUE,
  interval = NULL,
  partialmatch = NULL,
  output_format = "regulondb_result"
)

Arguments

regulondb

A regulondb() object.

dataset

Dataset of interest. Use the function list_datasets for an overview of valid datasets.

attributes

Vector of attributes to be retrieved.

filters

List of filters to be used. The names should correspond to the attribute and the values correspond to the condition for selection.

and

Logical argument. If TRUE (the default), all filters are combined with the "AND" operator; if FALSE, they are combined with the "OR" operator.

interval

Names of the filters whose values should be treated as an interval, i.e. a pair of lower and upper bounds (for example, posright = c("2000", "40000")).

partialmatch

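Names of the filters whose values are string patterns to be matched fully or partially in the query (for example, with partialmatch = "name", the filter name = "ara" matches the gene "carA").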

output_format

A string specifying the output format. Possible options are "regulondb_result", "GRanges", "DNAStringSet" or "BStringSet".

Value

By default, a regulondb_result object. If requested through the output_format parameter, a GRanges object or a Biostrings object (DNAStringSet or BStringSet) is returned instead.

Author

Carmina Barberena Jonas, Jesús Emiliano Sotelo Fonseca, José Alquicira Hernández, Joselyn Chávez

Examples

## Connect to the RegulonDB database if necessary
if (!exists("regulondb_conn")) regulondb_conn <- connect_database()
#> snapshotDate(): 2021-10-20

## Build the regulon db object
e_coli_regulondb <-
    regulondb(
        database_conn = regulondb_conn,
        organism = "E.coli",
        database_version = "1",
        genome_version = "1"
    )

## Obtain all the information from the "GENE" dataset
get_dataset(e_coli_regulondb, dataset = "GENE")
#> regulondb_result with 4602 rows and 33 columns
#>                id        name     bnumber          gi               synonyms
#>       <character> <character> <character> <character>            <character>
#> 1    ECK120000001         alr       b4053          NA ECK4045,EG10001,alr5..
#> 2    ECK120000002        modB       b0764          NA ECK0753,EG10002,b076..
#> 3    ECK120000003        cysZ       b2413          NA  ECK2408,EG10003,b2413
#> 4    ECK120000004         dfp       b3639          NA ECK3629,EG10004,b363..
#> 5    ECK120000005        dcuB       b4123          NA ECK4116,EG10006,b412..
#> ...           ...         ...         ...         ...                    ...
#> 4598 ECK125276531        yqfH       b4753          NA ECK4610,G0-16747,b4753
#> 4599 ECK125276532        yliM       b4736          NA ECK4593,G0-16731,b4736
#> 4600 ECK125276533        ynfS       b4750          NA ECK4607,G0-16744,b4750
#> 4601 ECK125276534        ylcJ       b4733          NA ECK4590,G0-16728,b4733
#> 4602 ECK125276535        sdhX       b4764          NA ECK4621,G0-17009,b47..
#>        posleft  posright      strand           dna_sequence
#>      <integer> <integer> <character>            <character>
#> 1      4265782   4266861     forward ATGCAAGCGGCAACTGTTGT..
#> 2       795862    796551     forward ATGATACTGACCGATCCAGA..
#> 3      2531463   2532224     forward ATGGTTTCATCATTCACATC..
#> 4      3812731   3813951     forward ATGAGCCTGGCCGGTAAAAA..
#> 5      4347404   4348744     reverse ATGTTATTTACTATCCAACT..
#> ...        ...       ...         ...                    ...
#> 4598   3032939   3033010     reverse ATGATTAACCAAGTGAGCGT..
#> 4599    850332    850397     forward ATGGAAACGTTCTGTTACAT..
#> 4600   1642122   1642211     forward ATGAATAACCCCGTCTGTCT..
#> 4601    568695    568844     forward ATGAGCCTCGTTTTATGCTT..
#> 4602    765050    765150     forward ATATCTGTAATAAGAAATAG..
#>            external_db_link      evidence_reference   product_id
#>                 <character>             <character>  <character>
#> 1    ASAP,http://asap.aha..                      NA ECK120004477
#> 2    ASAP,http://asap.aha..                      NA ECK120004478
#> 3    ASAP,http://asap.aha..                      NA ECK120004479
#> 4    ASAP,http://asap.aha..                      NA ECK120004480
#> 5    ASAP,http://asap.aha..                      NA ECK120004481
#> ...                     ...                     ...          ...
#> 4598                     NA                      NA ECK125276571
#> 4599                     NA                      NA ECK125276556
#> 4600                     NA                      NA ECK125276575
#> 4601                     NA                      NA ECK125276577
#> 4602                     NA \t\t\t30591570,30541135 ECK125276565
#>                product_name        product_synonym       product_sequence
#>                 <character>            <character>            <character>
#> 1        alanine racemase 1 Alr,alanine racemase.. MQAATVVINRRALRHNLQRL..
#> 2    molybdate ABC transp..         ChlJ,ModB,TslJ MILTDPEWQAVLLSLKVSSL..
#> 3    sulfate:H<sup>+</sup..                   CysZ MVSSFTSAPRSGFYYFAQGW..
#> 4    fused 4'-phosphopant..              CoaBC,Dfp MSLAGKKIVLGVSGGIAAYK..
#> 5    anaerobic C4-dicarbo..              DcuB,GenF MLFTIQLIIILICLFYGARK..
#> ...                     ...                    ...                    ...
#> 4598           protein YqfH                   YqfH MINQVSVYRQPPVLSGCRQV..
#> 4599           protein YliM                   YliM  METFCYMKWPVRHHKSRRVSH
#> 4600 Qin prophage; protei..                   YnfS MNNPVCLDDWLIGFKSLCCT..
#> 4601           protein YlcJ                   YlcJ MSLVLCFLLMSLFFMYSFVL..
#> 4602 small regulatory RNA..              RybD,SdhX AUAUCUGUAAUAAGAAAUAG..
#>      molecular_weigth isoelectric_point celullar_location
#>             <numeric>         <numeric>       <character>
#> 1              39.153             7.059           cytosol
#> 2              24.939            10.716    inner membrane
#> 3              29.305             9.809    inner membrane
#> 4              43.438             7.585           cytosol
#> 5              47.935             7.878    inner membrane
#> ...               ...               ...               ...
#> 4598            2.617            10.452                NA
#> 4599            2.716            10.877                NA
#> 4600            3.193             4.113    inner membrane
#> 4601            5.943            10.891                NA
#> 4602               NA                NA                NA
#>                product_note product_type             go_index_bp
#>                 <character>  <character>             <character>
#> 1                        NA           NA GO:0006522\tGO:000836..
#> 2    ModB is the predicte..           NA  GO:0015689\tGO:0055085
#> 3    CysZ is a high affin..           NA GO:0000103\tGO:000827..
#> 4    The <i>dfp</i> (<i>c..           NA GO:0008152\tGO:001593..
#> 5    DcuB is a C4-dicarbo..           NA GO:0009061\tGO:001574..
#> ...                     ...          ...                     ...
#> 4598 YqfH was identified ..           NA                      NA
#> 4599 YliM was identified ..           NA                      NA
#> 4600 YnfS was identified ..           NA                      NA
#> 4601 YlcJ was identified ..           NA                      NA
#> 4602 The small regulatory..    small RNA              GO:0040033
#>                   go_desc_bp             go_index_cc             go_desc_cc
#>                  <character>             <character>            <character>
#> 1     alanine metabolic pr..              GO:0005829                cytosol
#> 2     molybdate ion transp.. GO:0005886\tGO:000588.. plasma membrane,inte..
#> 3     sulfate assimilation.. GO:0005886\tGO:000588.. plasma membrane,inte..
#> 4    metabolic process\tco..  GO:0005737\tGO:0005829      cytoplasm,cytosol
#> 5     anaerobic respiratio.. GO:0005886\tGO:000588.. plasma membrane,inte..
#> ...                      ...                     ...                    ...
#> 4598                      NA                      NA                     NA
#> 4599                      NA                      NA                     NA
#> 4600                      NA GO:0005886\tGO:001602.. plasma membrane,memb..
#> 4601                      NA                      NA                     NA
#> 4602  negative regulation ..                      NA                     NA
#>                  go_index_mf              go_desc_mf product_external_db_links
#>                  <character>             <character>               <character>
#> 1    GO:0003824\tGO:000878.. catalytic activity\ta..   ECOCYC\thttp://biocyc..
#> 2                 GO:0015098  molybdate ion transm..   ECOCYC\thttp://biocyc..
#> 3     GO:0009675\tGO:0015116  high-affinity sulfat..   ECOCYC\thttp://biocyc..
#> 4    GO:0003824\tGO:000463.. catalytic activity\tp..   DIP\thttp://dip.doe-m..
#> 5     GO:0005469\tGO:0015556  succinate:fumarate a..   ECOCYC\thttp://biocyc..
#> ...                      ...                     ...                       ...
#> 4598                      NA                      NA   ECOCYC\thttp://biocyc..
#> 4599                      NA                      NA   ECOCYC\thttp://biocyc..
#> 4600                      NA                      NA   ECOCYC\thttp://biocyc..
#> 4601                      NA                      NA   ECOCYC\thttp://biocyc..
#> 4602  GO:0005515\tGO:0048027 protein binding\tmRNA..   ECOCYC\thttp://biocyc..
#>                product_ev_ref    operon_id          operon_name
#>                   <character>  <character>          <character>
#> 1                          NA ECK120029421                  alr
#> 2                          NA ECK120014822               modABC
#> 3      \t\t\t6341507,29792261 ECK120029422                 cysZ
#> 4                          NA ECK120030583                  dfp
#> 5      \t\t\t20860483,1512189 ECK120014710            dcuB-fumB
#> ...                       ...          ...                  ...
#> 4598                       NA           NA                   NA
#> 4599                       NA ECK125285808            yliM-ompX
#> 4600                       NA           NA                   NA
#> 4601                       NA           NA                   NA
#> 4602 IMP\t\tInferred from m.. ECK125285815 sdhCDAB-sucABCD-sdhX
#>                   tu_promoter    sigma_factor                 gene_tf
#>                   <character>     <character>             <character>
#> 1    ECK120029337\t\tECK120..              NA                      NA
#> 2    ECK120009489\tmodABC\t.. Sigma28,Sigma70 ECK120011235\tCRP ECK..
#> 3                          NA              NA                      NA
#> 4    ECK120030182\tdfp\tECK..              NA                      NA
#> 5     ECK120009650\tdcuB-fu..         Sigma70 ECK120011235\tCRP ECK..
#> ...                       ...             ...                     ...
#> 4598                       NA              NA                      NA
#> 4599  ECK120034934\tyliM-om..         Sigma70                      NA
#> 4600                       NA              NA                      NA
#> 4601                       NA              NA                      NA
#> 4602  ECK120009705\tsdhCDAB..         Sigma70 ECK120011345\tArcA EC..

## Get the attributes posright and name from the "GENE" dataset
get_dataset(e_coli_regulondb,
    dataset = "GENE",
    attributes = c("posright", "name")
)
#> regulondb_result with 4602 rows and 2 columns
#>       posright        name
#>      <integer> <character>
#> 1      4266861         alr
#> 2       796551        modB
#> 3      2532224        cysZ
#> 4      3813951         dfp
#> 5      4348744        dcuB
#> ...        ...         ...
#> 4598   3033010        yqfH
#> 4599    850397        yliM
#> 4600   1642211        ynfS
#> 4601    568844        ylcJ
#> 4602    765150        sdhX

## From the "GENE" dataset, get the name, strand, posright, product name
## and id of all genes whose name partially matches "ara", whose strand is
## "forward" and whose right position is between 2000 and 40000
get_dataset(
    e_coli_regulondb,
    dataset = "GENE",
    attributes = c("name", "strand", "posright", "product_name", "id"),
    filters = list(
        name = c("ara"),
        strand = c("forward"),
        posright = c("2000", "40000")
    ),
    and = TRUE,
    partialmatch = "name",
    interval = "posright"
)
#> regulondb_result with 1 row and 5 columns
#>          name      strand  posright           product_name           id
#>   <character> <character> <integer>            <character>  <character>
#> 1        carA     forward     30799 carbamoyl phosphate .. ECK120000130