diff --git a/.gitignore b/.gitignore index 264daca..2e04467 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ -*__pycache__ \ No newline at end of file +*__pycache__ +*.ipynb \ No newline at end of file diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..f2d6bb9 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "GLIP"] + path = GLIP + url = https://github.com/Junital/GLIP.git diff --git a/GLIP b/GLIP new file mode 160000 index 0000000..c43038f --- /dev/null +++ b/GLIP @@ -0,0 +1 @@ +Subproject commit c43038f7a96e5515c125a2faa93bdf7ff6934a9f diff --git a/assets/COCO_train2014_000000022882.jpg b/assets/COCO_train2014_000000022882.jpg new file mode 100644 index 0000000..be36b62 Binary files /dev/null and b/assets/COCO_train2014_000000022882.jpg differ diff --git a/assets/COCO_train2014_000000269022.jpg b/assets/COCO_train2014_000000269022.jpg new file mode 100644 index 0000000..4bec226 Binary files /dev/null and b/assets/COCO_train2014_000000269022.jpg differ diff --git a/assets/COCO_train2014_000000576849.jpg b/assets/COCO_train2014_000000576849.jpg new file mode 100644 index 0000000..54c3037 Binary files /dev/null and b/assets/COCO_train2014_000000576849.jpg differ diff --git a/data/ExperImages.zip b/data/ExperImages.zip new file mode 100644 index 0000000..33ccd00 Binary files /dev/null and b/data/ExperImages.zip differ diff --git a/data/ExperImages/n103484.jpg b/data/ExperImages/n103484.jpg new file mode 100644 index 0000000..38d92b0 Binary files /dev/null and b/data/ExperImages/n103484.jpg differ diff --git a/data/ExperImages/n109961.jpg b/data/ExperImages/n109961.jpg new file mode 100644 index 0000000..f3773cf Binary files /dev/null and b/data/ExperImages/n109961.jpg differ diff --git a/data/ExperImages/n111074.jpg b/data/ExperImages/n111074.jpg new file mode 100644 index 0000000..ec71a56 Binary files /dev/null and b/data/ExperImages/n111074.jpg differ diff --git a/data/ExperImages/n115871.jpg b/data/ExperImages/n115871.jpg new file mode 100644 index 0000000..a64afba Binary files /dev/null and b/data/ExperImages/n115871.jpg differ diff --git a/data/ExperImages/n130253.jpg b/data/ExperImages/n130253.jpg new file mode 100644 index 0000000..38fda3f Binary files /dev/null and b/data/ExperImages/n130253.jpg differ diff --git a/data/ExperImages/n130371.jpg b/data/ExperImages/n130371.jpg new file mode 100644 index 0000000..3ad0334 Binary files /dev/null and b/data/ExperImages/n130371.jpg differ diff --git a/data/ExperImages/n133975.jpg b/data/ExperImages/n133975.jpg new file mode 100644 index 0000000..38a5d7c Binary files /dev/null and b/data/ExperImages/n133975.jpg differ diff --git a/data/ExperImages/n139491.jpg b/data/ExperImages/n139491.jpg new file mode 100644 index 0000000..10b8ecb Binary files /dev/null and b/data/ExperImages/n139491.jpg differ diff --git a/data/ExperImages/n147001.jpg b/data/ExperImages/n147001.jpg new file mode 100644 index 0000000..0230873 Binary files /dev/null and b/data/ExperImages/n147001.jpg differ diff --git a/data/ExperImages/n149859.jpg b/data/ExperImages/n149859.jpg new file mode 100644 index 0000000..9b66e60 Binary files /dev/null and b/data/ExperImages/n149859.jpg differ diff --git a/data/ExperImages/n154501.jpg b/data/ExperImages/n154501.jpg new file mode 100644 index 0000000..d3bf565 Binary files /dev/null and b/data/ExperImages/n154501.jpg differ diff --git a/data/ExperImages/n155297.jpg b/data/ExperImages/n155297.jpg new file mode 100644 index 0000000..2a3e3bd Binary files /dev/null and 
b/data/ExperImages/n155297.jpg differ diff --git a/data/ExperImages/n15719.jpg b/data/ExperImages/n15719.jpg new file mode 100644 index 0000000..739dc36 Binary files /dev/null and b/data/ExperImages/n15719.jpg differ diff --git a/data/ExperImages/n158542.jpg b/data/ExperImages/n158542.jpg new file mode 100644 index 0000000..4ffcefc Binary files /dev/null and b/data/ExperImages/n158542.jpg differ diff --git a/data/ExperImages/n170047.jpg b/data/ExperImages/n170047.jpg new file mode 100644 index 0000000..ff913a5 Binary files /dev/null and b/data/ExperImages/n170047.jpg differ diff --git a/data/ExperImages/n171693.jpg b/data/ExperImages/n171693.jpg new file mode 100644 index 0000000..293c803 Binary files /dev/null and b/data/ExperImages/n171693.jpg differ diff --git a/data/ExperImages/n173361.jpg b/data/ExperImages/n173361.jpg new file mode 100644 index 0000000..336f662 Binary files /dev/null and b/data/ExperImages/n173361.jpg differ diff --git a/data/ExperImages/n175869.jpg b/data/ExperImages/n175869.jpg new file mode 100644 index 0000000..8419275 Binary files /dev/null and b/data/ExperImages/n175869.jpg differ diff --git a/data/ExperImages/n178815.jpg b/data/ExperImages/n178815.jpg new file mode 100644 index 0000000..e12f8a8 Binary files /dev/null and b/data/ExperImages/n178815.jpg differ diff --git a/data/ExperImages/n181615.jpg b/data/ExperImages/n181615.jpg new file mode 100644 index 0000000..1362abc Binary files /dev/null and b/data/ExperImages/n181615.jpg differ diff --git a/data/ExperImages/n182120.jpg b/data/ExperImages/n182120.jpg new file mode 100644 index 0000000..11d2cdd Binary files /dev/null and b/data/ExperImages/n182120.jpg differ diff --git a/data/ExperImages/n184739.jpg b/data/ExperImages/n184739.jpg new file mode 100644 index 0000000..59b586c Binary files /dev/null and b/data/ExperImages/n184739.jpg differ diff --git a/data/ExperImages/n188669.jpg b/data/ExperImages/n188669.jpg new file mode 100644 index 0000000..3b94d55 Binary files /dev/null and b/data/ExperImages/n188669.jpg differ diff --git a/data/ExperImages/n189986.jpg b/data/ExperImages/n189986.jpg new file mode 100644 index 0000000..224be63 Binary files /dev/null and b/data/ExperImages/n189986.jpg differ diff --git a/data/ExperImages/n194711.jpg b/data/ExperImages/n194711.jpg new file mode 100644 index 0000000..d388188 Binary files /dev/null and b/data/ExperImages/n194711.jpg differ diff --git a/data/ExperImages/n196089.jpg b/data/ExperImages/n196089.jpg new file mode 100644 index 0000000..9622186 Binary files /dev/null and b/data/ExperImages/n196089.jpg differ diff --git a/data/ExperImages/n20290.jpg b/data/ExperImages/n20290.jpg new file mode 100644 index 0000000..0e73bec Binary files /dev/null and b/data/ExperImages/n20290.jpg differ diff --git a/data/ExperImages/n210059.jpg b/data/ExperImages/n210059.jpg new file mode 100644 index 0000000..b503d11 Binary files /dev/null and b/data/ExperImages/n210059.jpg differ diff --git a/data/ExperImages/n210277.jpg b/data/ExperImages/n210277.jpg new file mode 100644 index 0000000..a5d91cc Binary files /dev/null and b/data/ExperImages/n210277.jpg differ diff --git a/data/ExperImages/n214414.jpg b/data/ExperImages/n214414.jpg new file mode 100644 index 0000000..027767f Binary files /dev/null and b/data/ExperImages/n214414.jpg differ diff --git a/data/ExperImages/n215517.jpg b/data/ExperImages/n215517.jpg new file mode 100644 index 0000000..e0e67bc Binary files /dev/null and b/data/ExperImages/n215517.jpg differ diff --git a/data/ExperImages/n229656.jpg 
b/data/ExperImages/n229656.jpg new file mode 100644 index 0000000..01fa258 Binary files /dev/null and b/data/ExperImages/n229656.jpg differ diff --git a/data/ExperImages/n239383.jpg b/data/ExperImages/n239383.jpg new file mode 100644 index 0000000..3c147a1 Binary files /dev/null and b/data/ExperImages/n239383.jpg differ diff --git a/data/ExperImages/n24913.jpg b/data/ExperImages/n24913.jpg new file mode 100644 index 0000000..9b6c98e Binary files /dev/null and b/data/ExperImages/n24913.jpg differ diff --git a/data/ExperImages/n253231.jpg b/data/ExperImages/n253231.jpg new file mode 100644 index 0000000..3c31c2c Binary files /dev/null and b/data/ExperImages/n253231.jpg differ diff --git a/data/ExperImages/n258003.jpg b/data/ExperImages/n258003.jpg new file mode 100644 index 0000000..82b34f0 Binary files /dev/null and b/data/ExperImages/n258003.jpg differ diff --git a/data/ExperImages/n261100.jpg b/data/ExperImages/n261100.jpg new file mode 100644 index 0000000..c43ab19 Binary files /dev/null and b/data/ExperImages/n261100.jpg differ diff --git a/data/ExperImages/n264509.jpg b/data/ExperImages/n264509.jpg new file mode 100644 index 0000000..e734049 Binary files /dev/null and b/data/ExperImages/n264509.jpg differ diff --git a/data/ExperImages/n266971.jpg b/data/ExperImages/n266971.jpg new file mode 100644 index 0000000..e170f05 Binary files /dev/null and b/data/ExperImages/n266971.jpg differ diff --git a/data/ExperImages/n267826.jpg b/data/ExperImages/n267826.jpg new file mode 100644 index 0000000..64e7161 Binary files /dev/null and b/data/ExperImages/n267826.jpg differ diff --git a/data/ExperImages/n274318.jpg b/data/ExperImages/n274318.jpg new file mode 100644 index 0000000..3dd7a7f Binary files /dev/null and b/data/ExperImages/n274318.jpg differ diff --git a/data/ExperImages/n275523.jpg b/data/ExperImages/n275523.jpg new file mode 100644 index 0000000..0907916 Binary files /dev/null and b/data/ExperImages/n275523.jpg differ diff --git a/data/ExperImages/n287162.jpg b/data/ExperImages/n287162.jpg new file mode 100644 index 0000000..4a2c104 Binary files /dev/null and b/data/ExperImages/n287162.jpg differ diff --git a/data/ExperImages/n298165.jpg b/data/ExperImages/n298165.jpg new file mode 100644 index 0000000..68e53d4 Binary files /dev/null and b/data/ExperImages/n298165.jpg differ diff --git a/data/ExperImages/n299577.jpg b/data/ExperImages/n299577.jpg new file mode 100644 index 0000000..6844b44 Binary files /dev/null and b/data/ExperImages/n299577.jpg differ diff --git a/data/ExperImages/n307753.jpg b/data/ExperImages/n307753.jpg new file mode 100644 index 0000000..6ffd362 Binary files /dev/null and b/data/ExperImages/n307753.jpg differ diff --git a/data/ExperImages/n311711.jpg b/data/ExperImages/n311711.jpg new file mode 100644 index 0000000..3d01a7c Binary files /dev/null and b/data/ExperImages/n311711.jpg differ diff --git a/data/ExperImages/n312992.jpg b/data/ExperImages/n312992.jpg new file mode 100644 index 0000000..45fa9dc Binary files /dev/null and b/data/ExperImages/n312992.jpg differ diff --git a/data/ExperImages/n318563.jpg b/data/ExperImages/n318563.jpg new file mode 100644 index 0000000..757b186 Binary files /dev/null and b/data/ExperImages/n318563.jpg differ diff --git a/data/ExperImages/n324512.jpg b/data/ExperImages/n324512.jpg new file mode 100644 index 0000000..8fc1d1e Binary files /dev/null and b/data/ExperImages/n324512.jpg differ diff --git a/data/ExperImages/n339300.jpg b/data/ExperImages/n339300.jpg new file mode 100644 index 0000000..2ad9807 Binary files /dev/null 
and b/data/ExperImages/n339300.jpg differ diff --git a/data/ExperImages/n339728.jpg b/data/ExperImages/n339728.jpg new file mode 100644 index 0000000..8d04212 Binary files /dev/null and b/data/ExperImages/n339728.jpg differ diff --git a/data/ExperImages/n341278.jpg b/data/ExperImages/n341278.jpg new file mode 100644 index 0000000..16b119b Binary files /dev/null and b/data/ExperImages/n341278.jpg differ diff --git a/data/ExperImages/n343034.jpg b/data/ExperImages/n343034.jpg new file mode 100644 index 0000000..264d7cd Binary files /dev/null and b/data/ExperImages/n343034.jpg differ diff --git a/data/ExperImages/n345315.jpg b/data/ExperImages/n345315.jpg new file mode 100644 index 0000000..cd90c7b Binary files /dev/null and b/data/ExperImages/n345315.jpg differ diff --git a/data/ExperImages/n345363.jpg b/data/ExperImages/n345363.jpg new file mode 100644 index 0000000..1dc750b Binary files /dev/null and b/data/ExperImages/n345363.jpg differ diff --git a/data/ExperImages/n349224.jpg b/data/ExperImages/n349224.jpg new file mode 100644 index 0000000..493ad91 Binary files /dev/null and b/data/ExperImages/n349224.jpg differ diff --git a/data/ExperImages/n366949.jpg b/data/ExperImages/n366949.jpg new file mode 100644 index 0000000..5574ba2 Binary files /dev/null and b/data/ExperImages/n366949.jpg differ diff --git a/data/ExperImages/n383044.jpg b/data/ExperImages/n383044.jpg new file mode 100644 index 0000000..78d999c Binary files /dev/null and b/data/ExperImages/n383044.jpg differ diff --git a/data/ExperImages/n394813.jpg b/data/ExperImages/n394813.jpg new file mode 100644 index 0000000..76fdb55 Binary files /dev/null and b/data/ExperImages/n394813.jpg differ diff --git a/data/ExperImages/n406179.jpg b/data/ExperImages/n406179.jpg new file mode 100644 index 0000000..fd8003a Binary files /dev/null and b/data/ExperImages/n406179.jpg differ diff --git a/data/ExperImages/n408516.jpg b/data/ExperImages/n408516.jpg new file mode 100644 index 0000000..5ad0d9f Binary files /dev/null and b/data/ExperImages/n408516.jpg differ diff --git a/data/ExperImages/n409008.jpg b/data/ExperImages/n409008.jpg new file mode 100644 index 0000000..ed994af Binary files /dev/null and b/data/ExperImages/n409008.jpg differ diff --git a/data/ExperImages/n41686.jpg b/data/ExperImages/n41686.jpg new file mode 100644 index 0000000..24e529e Binary files /dev/null and b/data/ExperImages/n41686.jpg differ diff --git a/data/ExperImages/n424704.jpg b/data/ExperImages/n424704.jpg new file mode 100644 index 0000000..6dd7044 Binary files /dev/null and b/data/ExperImages/n424704.jpg differ diff --git a/data/ExperImages/n437038.jpg b/data/ExperImages/n437038.jpg new file mode 100644 index 0000000..fc327b0 Binary files /dev/null and b/data/ExperImages/n437038.jpg differ diff --git a/data/ExperImages/n464936.jpg b/data/ExperImages/n464936.jpg new file mode 100644 index 0000000..677b950 Binary files /dev/null and b/data/ExperImages/n464936.jpg differ diff --git a/data/ExperImages/n466504.jpg b/data/ExperImages/n466504.jpg new file mode 100644 index 0000000..6a877bb Binary files /dev/null and b/data/ExperImages/n466504.jpg differ diff --git a/data/ExperImages/n468138.jpg b/data/ExperImages/n468138.jpg new file mode 100644 index 0000000..2396fcd Binary files /dev/null and b/data/ExperImages/n468138.jpg differ diff --git a/data/ExperImages/n474949.jpg b/data/ExperImages/n474949.jpg new file mode 100644 index 0000000..3c7c8e3 Binary files /dev/null and b/data/ExperImages/n474949.jpg differ diff --git a/data/ExperImages/n475122.jpg 
b/data/ExperImages/n475122.jpg new file mode 100644 index 0000000..d438334 Binary files /dev/null and b/data/ExperImages/n475122.jpg differ diff --git a/data/ExperImages/n479684.jpg b/data/ExperImages/n479684.jpg new file mode 100644 index 0000000..b8cc80e Binary files /dev/null and b/data/ExperImages/n479684.jpg differ diff --git a/data/ExperImages/n487547.jpg b/data/ExperImages/n487547.jpg new file mode 100644 index 0000000..ce59d37 Binary files /dev/null and b/data/ExperImages/n487547.jpg differ diff --git a/data/ExperImages/n488098.jpg b/data/ExperImages/n488098.jpg new file mode 100644 index 0000000..5ad32a3 Binary files /dev/null and b/data/ExperImages/n488098.jpg differ diff --git a/data/ExperImages/n49911.jpg b/data/ExperImages/n49911.jpg new file mode 100644 index 0000000..213dc3f Binary files /dev/null and b/data/ExperImages/n49911.jpg differ diff --git a/data/ExperImages/n511793.jpg b/data/ExperImages/n511793.jpg new file mode 100644 index 0000000..08afe81 Binary files /dev/null and b/data/ExperImages/n511793.jpg differ diff --git a/data/ExperImages/n51303.jpg b/data/ExperImages/n51303.jpg new file mode 100644 index 0000000..f94f76b Binary files /dev/null and b/data/ExperImages/n51303.jpg differ diff --git a/data/ExperImages/n513747.jpg b/data/ExperImages/n513747.jpg new file mode 100644 index 0000000..2f3001a Binary files /dev/null and b/data/ExperImages/n513747.jpg differ diff --git a/data/ExperImages/n514077.jpg b/data/ExperImages/n514077.jpg new file mode 100644 index 0000000..78d4fba Binary files /dev/null and b/data/ExperImages/n514077.jpg differ diff --git a/data/ExperImages/n515157.jpg b/data/ExperImages/n515157.jpg new file mode 100644 index 0000000..049813a Binary files /dev/null and b/data/ExperImages/n515157.jpg differ diff --git a/data/ExperImages/n524673.jpg b/data/ExperImages/n524673.jpg new file mode 100644 index 0000000..76187e3 Binary files /dev/null and b/data/ExperImages/n524673.jpg differ diff --git a/data/ExperImages/n525013.jpg b/data/ExperImages/n525013.jpg new file mode 100644 index 0000000..0a1555b Binary files /dev/null and b/data/ExperImages/n525013.jpg differ diff --git a/data/ExperImages/n536090.jpg b/data/ExperImages/n536090.jpg new file mode 100644 index 0000000..42fae4d Binary files /dev/null and b/data/ExperImages/n536090.jpg differ diff --git a/data/ExperImages/n537813.jpg b/data/ExperImages/n537813.jpg new file mode 100644 index 0000000..d11b18a Binary files /dev/null and b/data/ExperImages/n537813.jpg differ diff --git a/data/ExperImages/n544799.jpg b/data/ExperImages/n544799.jpg new file mode 100644 index 0000000..3a92f44 Binary files /dev/null and b/data/ExperImages/n544799.jpg differ diff --git a/data/ExperImages/n550668.jpg b/data/ExperImages/n550668.jpg new file mode 100644 index 0000000..5b9d4a4 Binary files /dev/null and b/data/ExperImages/n550668.jpg differ diff --git a/data/ExperImages/n553018.jpg b/data/ExperImages/n553018.jpg new file mode 100644 index 0000000..999fba3 Binary files /dev/null and b/data/ExperImages/n553018.jpg differ diff --git a/data/ExperImages/n554025.jpg b/data/ExperImages/n554025.jpg new file mode 100644 index 0000000..c91c4c9 Binary files /dev/null and b/data/ExperImages/n554025.jpg differ diff --git a/data/ExperImages/n557683.jpg b/data/ExperImages/n557683.jpg new file mode 100644 index 0000000..83581d2 Binary files /dev/null and b/data/ExperImages/n557683.jpg differ diff --git a/data/ExperImages/n560895.jpg b/data/ExperImages/n560895.jpg new file mode 100644 index 0000000..4576ba5 Binary files /dev/null and 
b/data/ExperImages/n560895.jpg differ diff --git a/data/ExperImages/n56556.jpg b/data/ExperImages/n56556.jpg new file mode 100644 index 0000000..b526630 Binary files /dev/null and b/data/ExperImages/n56556.jpg differ diff --git a/data/ExperImages/n565573.jpg b/data/ExperImages/n565573.jpg new file mode 100644 index 0000000..b3922c2 Binary files /dev/null and b/data/ExperImages/n565573.jpg differ diff --git a/data/ExperImages/n568258.jpg b/data/ExperImages/n568258.jpg new file mode 100644 index 0000000..b9c1330 Binary files /dev/null and b/data/ExperImages/n568258.jpg differ diff --git a/data/ExperImages/n59657.jpg b/data/ExperImages/n59657.jpg new file mode 100644 index 0000000..9e98dd6 Binary files /dev/null and b/data/ExperImages/n59657.jpg differ diff --git a/data/ExperImages/n59853.jpg b/data/ExperImages/n59853.jpg new file mode 100644 index 0000000..2cf18cb Binary files /dev/null and b/data/ExperImages/n59853.jpg differ diff --git a/data/ExperImages/n60178.jpg b/data/ExperImages/n60178.jpg new file mode 100644 index 0000000..5ee3f22 Binary files /dev/null and b/data/ExperImages/n60178.jpg differ diff --git a/data/ExperImages/n61019.jpg b/data/ExperImages/n61019.jpg new file mode 100644 index 0000000..5d026f8 Binary files /dev/null and b/data/ExperImages/n61019.jpg differ diff --git a/data/ExperImages/n66626.jpg b/data/ExperImages/n66626.jpg new file mode 100644 index 0000000..c046837 Binary files /dev/null and b/data/ExperImages/n66626.jpg differ diff --git a/data/ExperImages/n93063.jpg b/data/ExperImages/n93063.jpg new file mode 100644 index 0000000..f1d7c68 Binary files /dev/null and b/data/ExperImages/n93063.jpg differ diff --git a/data/problem.xlsx b/data/problem.xlsx new file mode 100644 index 0000000..f1e7cf4 Binary files /dev/null and b/data/problem.xlsx differ diff --git a/data/test.csv b/data/test.csv new file mode 100644 index 0000000..3662a71 --- /dev/null +++ b/data/test.csv @@ -0,0 +1,101 @@ +isBalanced,question,imageId +True,Which kind of furniture is below the decoration?,n274318 +True,Which kind of clothing is warm?,n115871 +True,Which kind of clothing is not black?,n184739 +True,Do the pants that are not dirty look large?,n66626 +True,The lamp that is not turned-off is sitting on top of what?,n210059 +True,What kind of furniture is made of metal?,n188669 +True,Is the shirt orange or blue?,n554025 +True,Do you see either any white towels or pillows?,n468138 +True,Is the closed drawer to the right of a chair?,n15719 +True,What is the lamp made of?,n133975 +True,What does the man wear?,n51303 +True,What is under the cooking utensil made of wood?,n349224 +True,Are the end table and the desk made of the same material?,n194711 +True,Is the counter above a drawer?,n229656 +True,Are the pants black and long?,n149859 +True,What is the cup to the left of the keyboard made of?,n475122 +True,Is the standing woman behind the tomatoes wearing a hat?,n299577 +True,Where is the man in front of the fence standing on?,n464936 +True,What appliance is not used?,n409008 +True,Do you see any women inside the library?,n93063 +True,How large is the device the computer monitor is beside of?,n479684 +True,The bun is on what?,n24913 +True,What is the boy doing?,n49911 +True,Is there a bed or a desk in this picture?,n170047 +True,Does the gate look metallic and tall?,n196089 +True,Does the mirror look clean and brown?,n266971 +True,What is in front of the street light?,n511793 +True,Is the helmet on the right?,n339728 +True,Are there white desks in the picture?,n345363 +True,Is the house both 
white and small?,n109961 +True,Is the bag behind a chair?,n214414 +True,Is the garbage can to the left of the people?,n147001 +True,Are the benches in front of the person hard and red?,n324512 +True,Is there a train above the street that is made of brick?,n253231 +True,The tomatoes are in what?,n557683 +True,Is the plate on a counter?,n318563 +True,Are there any women in the photo that are not riding?,n59657 +True,What is the piece of furniture that is hanging above the wall called?,n61019 +True,Is the plastic bucket to the right or to the left of the toilet that is white?,n171693 +True,What is the man wearing?,n275523 +True,Is there a blue window in this picture?,n189986 +True,Does the dress look short sleeved?,n515157 +True,How large is the pitcher that the ketchup is in front of?,n158542 +True,What is in front of the trees?,n139491 +True,Does the person that is not old wear a hat?,n560895 +True,Is the happy person on the left or on the right side of the image?,n383044 +True,Is the empty bottle on the right side or on the left?,n60178 +True,Which kind of clothing is light colored?,n111074 +True,Do you see tables in the photo?,n154501 +True,What animal is standing on the small boat?,n178815 +True,What is the plate near the candle holder sitting atop?,n264509 +True,Which place is it?,n424704 +True,What is the device in front of the person that is sitting on the ground called?,n20290 +True,Do you see end tables next to the sofa on the left part of the photo?,n307753 +True,What is under the windows?,n341278 +True,Does that sweatshirt have striped pattern and gray color?,n345315 +True,What is the color of the toilet paper?,n287162 +True,What's the man doing?,n513747 +True,Which device is on?,n59853 +True,Are there black sandals or boots?,n41686 +True,Is the plastic cup to the left of the other cup small and colorful?,n215517 +True,What kind of furniture are the drawers in?,n133975 +True,Are the men to the left of a bowl?,n298165 +True,What animal is brown?,n406179 +True,What is the color of the wine glasses?,n544799 +True,Are there both cars and fences in the photo?,n130253 +True,Are there either any plates or breads in the image?,n267826 +True,What is on the motorcycle?,n339300 +True,How large are the rocks?,n408516 +True,Which kind of food is bigger than the blueberry?,n466504 +True,What is the woman wearing?,n537813 +True,What does the man wear?,n366949 +True,"Which side of the picture are the bags on, the left or the right?",n568258 +True,Where is the snowboarder standing on?,n553018 +True,How is this vehicle called?,n487547 +True,Who is holding onto the umbrella?,n103484 +True,What is the toilet brush next to the toilet made of?,n514077 +True,Who in this image is talking?,n343034 +True,What type of fast food is the woman that is not old looking at?,n394813 +True,Are the daughter that is not old and the daughter to the right of the dad both happy?,n155297 +True,Are the people that are talking leaning on the wood fence?,n181615 +True,Is the clock both round and gold?,n239383 +True,Is the girl to the left of the backpack waiting or playing?,n437038 +True,Is the table gray?,n56556 +True,What is the color of the bread that is not little?,n312992 +True,What is in front of the trees?,n536090 +True,Is that street sign black and small?,n311711 +True,Is that fork made of stainless steel?,n474949 +True,What is the weather like in the picture?,n130371 +True,Is there a mug on top of the bench?,n261100 +True,Is the sand both black and wet?,n175869 +True,What is the aircraft that the woman is looking 
at?,n524673 +True,What pieces of furniture are to the left of the car?,n173361 +True,Who in this photograph is looking down?,n488098 +True,Is the sink short or tall?,n182120 +True,What is that fence in front of?,n258003 +True,Are there any chairs near the decorative painting?,n210277 +True,How clean is the headband that the girl is wearing?,n525013 +True,Is the small building behind or in front of the bushy tree?,n550668 +True,Does the umbrella look female and white?,n565573 diff --git a/data/visualization.xlsx b/data/visualization.xlsx new file mode 100644 index 0000000..97de500 Binary files /dev/null and b/data/visualization.xlsx differ diff --git a/download_GLIP.sh b/download_GLIP.sh new file mode 100644 index 0000000..6e78006 --- /dev/null +++ b/download_GLIP.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +# change this to your preferred download location +PRETRAINED_MODELS_PATH=./pretrained_models + +# GLIP model +mkdir -p $PRETRAINED_MODELS_PATH/GLIP/checkpoints +mkdir -p $PRETRAINED_MODELS_PATH/GLIP/configs +wget -nc -P $PRETRAINED_MODELS_PATH/GLIP/checkpoints https://huggingface.co/GLIPModel/GLIP/resolve/main/glip_large_model.pth +wget -nc -P $PRETRAINED_MODELS_PATH/GLIP/configs https://raw.githubusercontent.com/microsoft/GLIP/main/configs/pretrain/glip_Swin_L.yaml diff --git a/engine/object_graph.py b/engine/object_graph.py new file mode 100644 index 0000000..ed7c593 --- /dev/null +++ b/engine/object_graph.py @@ -0,0 +1,39 @@ +class ObjectGraph: + + def __init__(self, Name: str, Box, + Category : str = "object", + Location : tuple = (0, 0), + Size : tuple = (0, 0)) -> None: + + self.Attribute = dict( + Name = Name, + Box = Box, + Category = Category, + Location = Location, + Size = Size + ) + + def add(self, key: str, value: any) -> None: + + self.Attribute[key] = value + +class ObjectGraphGroup: + + def __init__(self, img=None, groupA=None, groupB=None): + if img is not None: + self.Graphs = [] + self.Relations = dict() + self.Img = img + elif groupA is not None and groupB is not None: + self.Graphs = groupA.Graphs + groupB.Graphs + self.Relations = groupA.Relations | groupB.Relations + self.Img = groupA.Img + else: + raise ValueError("You must give a image or give two groups.") + + def add_graph(self, graph: ObjectGraph): + + self.Graphs.append(graph) + + def add_relation(self, objA, objB, relation): + self.Relations[(objA, objB)] = relation diff --git a/engine/step_interpreters.py b/engine/step_interpreters.py index c588497..3f0584a 100644 --- a/engine/step_interpreters.py +++ b/engine/step_interpreters.py @@ -1,7 +1,10 @@ import cv2 import os import torch -import openai +import sys +import timeit +from typing import Union +from openai import OpenAI import functools import numpy as np import face_detection @@ -11,9 +14,14 @@ from PIL import Image,ImageDraw,ImageFont,ImageFilter from transformers import (ViltProcessor, ViltForQuestionAnswering, OwlViTProcessor, OwlViTForObjectDetection, + AutoModelForZeroShotObjectDetection, DetrImageProcessor, DetrForObjectDetection, MaskFormerFeatureExtractor, MaskFormerForInstanceSegmentation, CLIPProcessor, CLIPModel, AutoProcessor, BlipForQuestionAnswering) from diffusers import StableDiffusionInpaintPipeline +from .object_graph import ObjectGraph, ObjectGraphGroup +import re +from maskrcnn_benchmark.engine.predictor_glip import GLIPDemo, to_image_list, create_positive_map, \ + create_positive_map_label_to_token_from_positive_map from .nms import nms from vis_utils import html_embed_image, html_colored_span, vis_masks @@ -191,18 +199,483 @@ def 
execute(self,prog_step,inspect=False): return answer +class BuildInterpreter(): + step_name = "BUILD" + + def __init__(self): + self.vqa = VQAInterpreter() + + self.crop = CropInterpreter() + + def parse(self,prog_step): + parse_result = parse_step(prog_step.prog_str) + step_name = parse_result['step_name'] + objs_var = parse_result['args']['objects'] + output_var = parse_result['output_var'] + assert(step_name==self.step_name) + return objs_var,output_var + + def execute(self, prog_step,inspect=False): + objs_var, output_var = self.parse(prog_step) + + objs = prog_step.state[objs_var] + img = objs["img"] + category = objs["category"] + + graphs = ObjectGraphGroup(img) + + for obj in objs["box"]: + cropped_img = self.crop.cropbox(obj, img) + + answer = self.vqa.predict(cropped_img, f"What's this {category}?") + + mid_w, mid_h = (obj[0] + obj[2]) / 2, (obj[1] + obj[3]) / 2 + w, h = img.size + location = (mid_w / w, mid_h / h) + size = (obj[2] - obj[0]), obj[3] - obj[1] + + graphs.add_graph(ObjectGraph(answer, obj, category, location, size)) + + print(graphs.Graphs) + prog_step.state[output_var] = graphs + +class ADDInterpreter(): + step_name = 'ADD' + + def __init__(self): + self.vqa = VQAInterpreter() + + def parse(self,prog_step): + parse_result = parse_step(prog_step.prog_str) + step_name = parse_result['step_name'] + graph_var = parse_result['args']['graph'] + attribute_str = parse_result['args']['attribute'] + output_var = parse_result['output_var'] + assert(step_name==self.step_name) + return graph_var,attribute_str,output_var + + def execute(self, prog_step,inspect=False): + graph_var, attribute_str, output_var = self.parse(prog_step) + + graph = prog_step.state[graph_var] + img = graph.Img + + res_graph = ObjectGraphGroup(img=img) + + for obj in graph.Graphs: + if attribute_str in obj.Attribute: + pass + else: + cropped_img = img.crop(obj.Attribute["Box"]) + obj_name = obj.Attribute["Name"] + answer = self.vqa.predict(cropped_img, f"What's the {attribute_str} of this {obj_name}?") + + obj.add(attribute_str, answer) + + res_graph.add_graph(obj) + + prog_step.state[output_var] = res_graph + + return res_graph + +class MERGEInterpreter(): + step_name = 'MERGE' + + RELATION_MESSAGE = [ + { + "role": "system", + "content": """Given the relationship (subject, object): relationship, generate a question that asks about this relationship. + +For example: +**Input**: (bottles, wine): right_of +**Output**: "Is the bottles to the right of the wine?" + +Please follow this format to create the questions. +""" + }, + { + "role": "user", + "content": """**Input:** (people, umbrella): carry""" + }, + { + "role": "assistant", + "content": """**Output:** "Is the people carrying the umbrella?" """ + }, + { + "role": "user", + "content": """**Input:** (book, table): under""" + }, + { + "role": "assistant", + "content": """**Output:** "Is the book under the table?"
""" + } + ] + + def __init__(self): + self.vqa = VQAInterpreter() + self.client = OpenAI() + + def parse(self,prog_step): + parse_result = parse_step(prog_step.prog_str) + step_name = parse_result['step_name'] + graph1_var = parse_result['args']['graphA'] + graph2_var = parse_result['args']['graphB'] + relation_str = parse_result['args']['relation'] + output_var = parse_result['output_var'] + assert(step_name==self.step_name) + return graph1_var,graph2_var,relation_str,output_var + + def llm_ask(self, relation): + message = list(self.RELATION_MESSAGE) # copy the template so repeated calls do not append to the shared class attribute + message.append({"role": "user", "content": f"**Input:** {relation}"}) + + response = self.client.chat.completions.create( + model="gpt-4o", + temperature=0.8, + messages=message + ) + + answer = response.choices[0].message.content + + matches = re.findall(r'"([^"]*)"', answer) + + if len(matches) > 0: + return matches[0] + + return "" + + def get_categories(self, group): + categories = [] + + for graph in group.Graphs: + if graph.Attribute["Category"] not in categories: + categories.append(graph.Attribute["Category"]) + + return categories + + def focus_image(self, img, box1, box2): + black_image = Image.new('RGB', img.size, (0, 0, 0)) + + region1 = img.crop(box1) + black_image.paste(region1, (box1[0], box1[1])) + + region2 = img.crop(box2) + black_image.paste(region2, (box2[0], box2[1])) + + return black_image + + def execute(self, prog_step,inspect=False): + graph1_var, graph2_var, relation_str, output_var = self.parse(prog_step) + + graph1 = prog_step.state[graph1_var] + graph2 = prog_step.state[graph2_var] + + if(relation_str == "None"): + + merged_graph = ObjectGraphGroup(groupA=graph1, groupB=graph2) + + else: + merged_graph = ObjectGraphGroup(groupA=graph1, groupB=graph2) + img = merged_graph.Img + + for objA in graph1.Graphs: + for objB in graph2.Graphs: + new_img = self.focus_image(img, objA.Attribute["Box"], objB.Attribute["Box"]) + new_img.save("test.jpg") + + Aname = objA.Attribute["Name"] + Bname = objB.Attribute["Name"] + + question = self.llm_ask(f"({Aname}, {Bname}): {relation_str}") + print(question) + + answer = self.vqa.predict(new_img, question) + + if(answer == "yes"): + merged_graph.add_relation(objA, objB, relation_str) + + prog_step.state[output_var] = merged_graph + + return merged_graph + +class HiddenPrints: + hide_prints = False + + def __init__(self, model_name=None, console=None, use_newline=True): + self.model_name = model_name + self.console = console + self.use_newline = use_newline + self.tqdm_aux = None + + def __enter__(self): + if self.hide_prints: + import tqdm # We need to do an extra step to hide tqdm outputs. Does not work in Jupyter Notebooks.
+ + def nop(it, *a, **k): + return it + + self.tqdm_aux = tqdm.tqdm + tqdm.tqdm = nop + + if self.model_name is not None: + self.console.print(f'Loading {self.model_name}...') + self._original_stdout = sys.stdout + self._original_stderr = sys.stderr + sys.stdout = open(os.devnull, 'w') + # May not be what we always want, but some annoying warnings end up to stderr + sys.stderr = open(os.devnull, 'w') + + def __exit__(self, exc_type, exc_val, exc_tb): + if self.hide_prints: + sys.stdout.close() + sys.stdout = self._original_stdout + sys.stderr = self._original_stderr + if self.model_name is not None: + self.console.print(f'{self.model_name} loaded ') + import tqdm + tqdm.tqdm = self.tqdm_aux + +class GLIPLocInterpreter(GLIPDemo): + step_name = 'LOC' + + def __init__(self, *args_demo): + + working_dir = f'./pretrained_models/GLIP/' + + config_file = working_dir + "configs/glip_Swin_L.yaml" + weight_file = working_dir + "checkpoints/glip_large_model.pth" + + kwargs = { + 'min_image_size': 800, + 'confidence_threshold': 0.5, + 'show_mask_heatmaps': False + } + + self.dev = "cuda" if torch.cuda.is_available() else "cpu" + + from maskrcnn_benchmark.config import cfg + + # manual override some options + cfg.local_rank = 0 + cfg.num_gpus = 1 + cfg.merge_from_file(config_file) + cfg.merge_from_list(["MODEL.WEIGHT", weight_file]) + cfg.merge_from_list(["MODEL.DEVICE", self.dev]) + + with HiddenPrints("GLIP"), torch.cuda.device(self.dev): + from transformers.utils import logging + logging.set_verbosity_error() + GLIPDemo.__init__(self, cfg, *args_demo, **kwargs) + if self.cfg.MODEL.RPN_ARCHITECTURE == "VLDYHEAD": + plus = 1 + else: + plus = 0 + self.plus = plus + self.color = 255 + + @torch.no_grad() + def compute_prediction(self, original_image, original_caption, custom_entity=None): + image = self.transforms(original_image) + # image = [image, image.permute(0, 2, 1)] + image_list = to_image_list(image, self.cfg.DATALOADER.SIZE_DIVISIBILITY) + image_list = image_list.to(self.dev) + # caption + if isinstance(original_caption, list): + + if len(original_caption) > 40: + all_predictions = None + for loop_num, i in enumerate(range(0, len(original_caption), 40)): + list_step = original_caption[i:i + 40] + prediction_step = self.compute_prediction(original_image, list_step, custom_entity=None) + if all_predictions is None: + all_predictions = prediction_step + else: + # Aggregate predictions + all_predictions.bbox = torch.cat((all_predictions.bbox, prediction_step.bbox), dim=0) + for k in all_predictions.extra_fields: + all_predictions.extra_fields[k] = \ + torch.cat((all_predictions.extra_fields[k], + prediction_step.extra_fields[k] + loop_num), dim=0) + return all_predictions + + # we directly provided a list of category names + caption_string = "" + tokens_positive = [] + seperation_tokens = " . 
" + for word in original_caption: + tokens_positive.append([len(caption_string), len(caption_string) + len(word)]) + caption_string += word + caption_string += seperation_tokens + + tokenized = self.tokenizer([caption_string], return_tensors="pt") + # tokens_positive = [tokens_positive] # This was wrong + tokens_positive = [[v] for v in tokens_positive] + + original_caption = caption_string + # print(tokens_positive) + else: + tokenized = self.tokenizer([original_caption], return_tensors="pt") + if custom_entity is None: + tokens_positive = self.run_ner(original_caption) + # print(tokens_positive) + # process positive map + positive_map = create_positive_map(tokenized, tokens_positive) + + positive_map_label_to_token = create_positive_map_label_to_token_from_positive_map(positive_map, + plus=self.plus) + self.positive_map_label_to_token = positive_map_label_to_token + tic = timeit.time.perf_counter() + + # compute predictions + with HiddenPrints(): # Hide some deprecated notices + predictions = self.model(image_list, captions=[original_caption], + positive_map=positive_map_label_to_token) + predictions = [o.to(self.cpu_device) for o in predictions] + # print("inference time per image: {}".format(timeit.time.perf_counter() - tic)) + + # always single image is passed at a time + prediction = predictions[0] + + # reshape prediction (a BoxList) into the original image size + height, width = original_image.shape[-2:] + # if self.tensor_inputs: + # else: + # height, width = original_image.shape[:-1] + prediction = prediction.resize((width, height)) + + if prediction.has_field("mask"): + # if we have masks, paste the masks in the right position + # in the image, as defined by the bounding boxes + masks = prediction.get_field("mask") + # always single image is passed at a time + masks = self.masker([masks], [prediction])[0] + prediction.add_field("mask", masks) + + return prediction + + @staticmethod + def to_left_right_upper_lower(bboxes): + return [(bbox[1], bbox[3], bbox[0], bbox[2]) for bbox in bboxes] + + @staticmethod + def to_xmin_ymin_xmax_ymax(bboxes): + # invert the previous method + return [(bbox[2], bbox[0], bbox[3], bbox[1]) for bbox in bboxes] + + @staticmethod + def prepare_image(image): + image = image[[2, 1, 0]] # convert to bgr for opencv-format for glip + return image + + @torch.no_grad() + def forward(self, image: torch.Tensor, obj: Union[str, list], return_labels: bool = False, + confidence_threshold=None): + + if confidence_threshold is not None: + original_confidence_threshold = self.confidence_threshold + self.confidence_threshold = confidence_threshold + + # if isinstance(object, list): + # object = ' . '.join(object) + ' .' 
# add separation tokens + image = self.prepare_image(image) + + # Avoid the resizing creating a huge image in a pathological case + ratio = image.shape[1] / image.shape[2] + ratio = max(ratio, 1 / ratio) + original_min_image_size = self.min_image_size + if ratio > 10: + self.min_image_size = int(original_min_image_size * 10 / ratio) + self.transforms = self.build_transform() + + with torch.cuda.device(self.dev): + inference_output = self.inference(image, obj) + + bboxes = inference_output.bbox.cpu().numpy().astype(int) + # bboxes = self.to_left_right_upper_lower(bboxes) + + if ratio > 10: + self.min_image_size = original_min_image_size + self.transforms = self.build_transform() + + bboxes = torch.tensor(bboxes) + + # Convert to [left, lower, right, upper] instead of [left, upper, right, lower] + height = image.shape[-2] + bboxes = torch.stack([bboxes[:, 0], height - bboxes[:, 3], bboxes[:, 2], height - bboxes[:, 1]], dim=1) + + if confidence_threshold is not None: + self.confidence_threshold = original_confidence_threshold + if return_labels: + # subtract 1 because it's 1-indexed for some reason + return bboxes, inference_output.get_field("labels").cpu().numpy() - 1 + return bboxes + + def box_image(self,img,boxes,highlight_best=True): + img1 = img.copy() + draw = ImageDraw.Draw(img1) + for i,box in enumerate(boxes): + if i==0 and highlight_best: + color = 'red' + else: + color = 'blue' + + draw.rectangle(box,outline=color,width=5) + + return img1 + + def parse(self,prog_step): + parse_result = parse_step(prog_step.prog_str) + step_name = parse_result['step_name'] + img_var = parse_result['args']['image'] + obj_name = eval(parse_result['args']['object']) + output_var = parse_result['output_var'] + assert(step_name==self.step_name) + return img_var,obj_name,output_var + + def execute(self,prog_step,inspect=False): + img_var,obj_name,output_var = self.parse(prog_step) + img = prog_step.state[img_var] + + bboxes = self.forward(img,obj_name) + + box_img = self.box_image(img, bboxes) + + objs = dict( + box = bboxes, + category = obj_name, + img = img + ) + + prog_step.state[output_var] = objs + prog_step.state[output_var+'_IMAGE'] = box_img + if inspect: + html_str = self.html(img, box_img, output_var, obj_name) + return objs, html_str + + return objs class LocInterpreter(): step_name = 'LOC' - def __init__(self,thresh=0.1,nms_thresh=0.5): + def __init__(self, thresh=0.1,nms_thresh=0.5): print(f'Registering {self.step_name} step') - self.device = "cuda:0" if torch.cuda.is_available() else "cpu" - self.processor = OwlViTProcessor.from_pretrained( - "google/owlvit-large-patch14") - self.model = OwlViTForObjectDetection.from_pretrained( - "google/owlvit-large-patch14").to(self.device) - self.model.eval() + + model_id = "IDEA-Research/grounding-dino-base" + self.device = "cuda" if torch.cuda.is_available() else "cpu" + + self.processor = AutoProcessor.from_pretrained(model_id) + self.model = AutoModelForZeroShotObjectDetection.from_pretrained( + model_id).to(self.device) + + special_model_id = "facebook/detr-resnet-50" + self.special_processor = DetrImageProcessor.from_pretrained( + special_model_id, revision="no_timm") + self.special_model = DetrForObjectDetection.from_pretrained( + special_model_id, revision="no_timm").to(self.device) + self.thresh = thresh self.nms_thresh = nms_thresh @@ -225,19 +698,22 @@ def normalize_coord(self,bbox,img_size): return [x1,y1,x2,y2] def predict(self,img,obj_name): - encoding = self.processor( - text=[[f'a photo of {obj_name}']], - images=img, - return_tensors='pt') - encoding
= {k:v.to(self.device) for k,v in encoding.items()} + prompt = f"a {obj_name}." + + inputs = self.processor( + images=img, text=prompt, return_tensors="pt").to(self.device) + with torch.no_grad(): - outputs = self.model(**encoding) - for k,v in outputs.items(): - if v is not None: - outputs[k] = v.to('cpu') if isinstance(v, torch.Tensor) else v + outputs = self.model(**inputs) + + results = self.processor.post_process_grounded_object_detection( + outputs, + inputs.input_ids, + box_threshold=0.4, + text_threshold=0.3, + target_sizes=[img.size[::-1]] + ) - target_sizes = torch.Tensor([img.size[::-1]]) - results = self.processor.post_process_object_detection(outputs=outputs,threshold=self.thresh,target_sizes=target_sizes) boxes, scores = results[0]["boxes"], results[0]["scores"] boxes = boxes.cpu().detach().numpy().tolist() scores = scores.cpu().detach().numpy().tolist() @@ -256,6 +732,36 @@ def predict(self,img,obj_name): selected_boxes, selected_scores = nms( selected_boxes,selected_scores,self.nms_thresh) return selected_boxes + + def special_predict(self, img): + inputs = self.special_processor( + images=img, return_tensors="pt").to(self.device) + + with torch.no_grad(): + outputs = self.special_model(**inputs) + + target_sizes = torch.tensor([img.size[::-1]]) + results = self.special_processor.post_process_object_detection( + outputs, target_sizes=target_sizes, threshold=0.9)[0] + + boxes, scores = results["boxes"], results["scores"] + boxes = boxes.tolist() + scores = scores.tolist() + if len(boxes)==0: + return [] + + boxes, scores = zip(*sorted(zip(boxes,scores),key=lambda x: x[1],reverse=True)) + selected_boxes = [] + selected_scores = [] + for i in range(len(scores)): + if scores[i] > self.thresh: + coord = self.normalize_coord(boxes[i],img.size) + selected_boxes.append(coord) + selected_scores.append(scores[i]) + + selected_boxes, selected_scores = nms( + selected_boxes,selected_scores,self.nms_thresh) + return selected_boxes def top_box(self,img): w,h = img.size @@ -307,17 +813,26 @@ def execute(self,prog_step,inspect=False): bboxes = [self.left_box(img)] elif obj_name=='RIGHT': bboxes = [self.right_box(img)] + elif obj_name=='object': + bboxes = self.special_predict(img) else: bboxes = self.predict(img,obj_name) box_img = self.box_image(img, bboxes) - prog_step.state[output_var] = bboxes + + objs = dict( + box = bboxes, + category = obj_name, + img = img + ) + + prog_step.state[output_var] = objs prog_step.state[output_var+'_IMAGE'] = box_img if inspect: html_str = self.html(img, box_img, output_var, obj_name) - return bboxes, html_str + return objs, html_str - return bboxes + return objs class Loc2Interpreter(LocInterpreter): @@ -414,6 +929,12 @@ def html(self,img,out_img,output_var,box_img): step_name = html_step_name(self.step_name) box_arg = html_arg_name('bbox') return f"""