diff --git a/.gitignore b/.gitignore index 264daca..2e04467 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ -*__pycache__ \ No newline at end of file +*__pycache__ +*.ipynb \ No newline at end of file diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..f2d6bb9 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "GLIP"] + path = GLIP + url = https://github.com/Junital/GLIP.git diff --git a/GLIP b/GLIP new file mode 160000 index 0000000..c43038f --- /dev/null +++ b/GLIP @@ -0,0 +1 @@ +Subproject commit c43038f7a96e5515c125a2faa93bdf7ff6934a9f diff --git a/assets/COCO_train2014_000000022882.jpg b/assets/COCO_train2014_000000022882.jpg new file mode 100644 index 0000000..be36b62 Binary files /dev/null and b/assets/COCO_train2014_000000022882.jpg differ diff --git a/assets/COCO_train2014_000000269022.jpg b/assets/COCO_train2014_000000269022.jpg new file mode 100644 index 0000000..4bec226 Binary files /dev/null and b/assets/COCO_train2014_000000269022.jpg differ diff --git a/assets/COCO_train2014_000000576849.jpg b/assets/COCO_train2014_000000576849.jpg new file mode 100644 index 0000000..54c3037 Binary files /dev/null and b/assets/COCO_train2014_000000576849.jpg differ diff --git a/data/ExperImages.zip b/data/ExperImages.zip new file mode 100644 index 0000000..33ccd00 Binary files /dev/null and b/data/ExperImages.zip differ diff --git a/data/ExperImages/n103484.jpg b/data/ExperImages/n103484.jpg new file mode 100644 index 0000000..38d92b0 Binary files /dev/null and b/data/ExperImages/n103484.jpg differ diff --git a/data/ExperImages/n109961.jpg b/data/ExperImages/n109961.jpg new file mode 100644 index 0000000..f3773cf Binary files /dev/null and b/data/ExperImages/n109961.jpg differ diff --git a/data/ExperImages/n111074.jpg b/data/ExperImages/n111074.jpg new file mode 100644 index 0000000..ec71a56 Binary files /dev/null and b/data/ExperImages/n111074.jpg differ diff --git a/data/ExperImages/n115871.jpg b/data/ExperImages/n115871.jpg new file mode 100644 index 0000000..a64afba Binary files /dev/null and b/data/ExperImages/n115871.jpg differ diff --git a/data/ExperImages/n130253.jpg b/data/ExperImages/n130253.jpg new file mode 100644 index 0000000..38fda3f Binary files /dev/null and b/data/ExperImages/n130253.jpg differ diff --git a/data/ExperImages/n130371.jpg b/data/ExperImages/n130371.jpg new file mode 100644 index 0000000..3ad0334 Binary files /dev/null and b/data/ExperImages/n130371.jpg differ diff --git a/data/ExperImages/n133975.jpg b/data/ExperImages/n133975.jpg new file mode 100644 index 0000000..38a5d7c Binary files /dev/null and b/data/ExperImages/n133975.jpg differ diff --git a/data/ExperImages/n139491.jpg b/data/ExperImages/n139491.jpg new file mode 100644 index 0000000..10b8ecb Binary files /dev/null and b/data/ExperImages/n139491.jpg differ diff --git a/data/ExperImages/n147001.jpg b/data/ExperImages/n147001.jpg new file mode 100644 index 0000000..0230873 Binary files /dev/null and b/data/ExperImages/n147001.jpg differ diff --git a/data/ExperImages/n149859.jpg b/data/ExperImages/n149859.jpg new file mode 100644 index 0000000..9b66e60 Binary files /dev/null and b/data/ExperImages/n149859.jpg differ diff --git a/data/ExperImages/n154501.jpg b/data/ExperImages/n154501.jpg new file mode 100644 index 0000000..d3bf565 Binary files /dev/null and b/data/ExperImages/n154501.jpg differ diff --git a/data/ExperImages/n155297.jpg b/data/ExperImages/n155297.jpg new file mode 100644 index 0000000..2a3e3bd Binary files /dev/null and 
b/data/ExperImages/n155297.jpg differ diff --git a/data/ExperImages/n15719.jpg b/data/ExperImages/n15719.jpg new file mode 100644 index 0000000..739dc36 Binary files /dev/null and b/data/ExperImages/n15719.jpg differ diff --git a/data/ExperImages/n158542.jpg b/data/ExperImages/n158542.jpg new file mode 100644 index 0000000..4ffcefc Binary files /dev/null and b/data/ExperImages/n158542.jpg differ diff --git a/data/ExperImages/n170047.jpg b/data/ExperImages/n170047.jpg new file mode 100644 index 0000000..ff913a5 Binary files /dev/null and b/data/ExperImages/n170047.jpg differ diff --git a/data/ExperImages/n171693.jpg b/data/ExperImages/n171693.jpg new file mode 100644 index 0000000..293c803 Binary files /dev/null and b/data/ExperImages/n171693.jpg differ diff --git a/data/ExperImages/n173361.jpg b/data/ExperImages/n173361.jpg new file mode 100644 index 0000000..336f662 Binary files /dev/null and b/data/ExperImages/n173361.jpg differ diff --git a/data/ExperImages/n175869.jpg b/data/ExperImages/n175869.jpg new file mode 100644 index 0000000..8419275 Binary files /dev/null and b/data/ExperImages/n175869.jpg differ diff --git a/data/ExperImages/n178815.jpg b/data/ExperImages/n178815.jpg new file mode 100644 index 0000000..e12f8a8 Binary files /dev/null and b/data/ExperImages/n178815.jpg differ diff --git a/data/ExperImages/n181615.jpg b/data/ExperImages/n181615.jpg new file mode 100644 index 0000000..1362abc Binary files /dev/null and b/data/ExperImages/n181615.jpg differ diff --git a/data/ExperImages/n182120.jpg b/data/ExperImages/n182120.jpg new file mode 100644 index 0000000..11d2cdd Binary files /dev/null and b/data/ExperImages/n182120.jpg differ diff --git a/data/ExperImages/n184739.jpg b/data/ExperImages/n184739.jpg new file mode 100644 index 0000000..59b586c Binary files /dev/null and b/data/ExperImages/n184739.jpg differ diff --git a/data/ExperImages/n188669.jpg b/data/ExperImages/n188669.jpg new file mode 100644 index 0000000..3b94d55 Binary files /dev/null and b/data/ExperImages/n188669.jpg differ diff --git a/data/ExperImages/n189986.jpg b/data/ExperImages/n189986.jpg new file mode 100644 index 0000000..224be63 Binary files /dev/null and b/data/ExperImages/n189986.jpg differ diff --git a/data/ExperImages/n194711.jpg b/data/ExperImages/n194711.jpg new file mode 100644 index 0000000..d388188 Binary files /dev/null and b/data/ExperImages/n194711.jpg differ diff --git a/data/ExperImages/n196089.jpg b/data/ExperImages/n196089.jpg new file mode 100644 index 0000000..9622186 Binary files /dev/null and b/data/ExperImages/n196089.jpg differ diff --git a/data/ExperImages/n20290.jpg b/data/ExperImages/n20290.jpg new file mode 100644 index 0000000..0e73bec Binary files /dev/null and b/data/ExperImages/n20290.jpg differ diff --git a/data/ExperImages/n210059.jpg b/data/ExperImages/n210059.jpg new file mode 100644 index 0000000..b503d11 Binary files /dev/null and b/data/ExperImages/n210059.jpg differ diff --git a/data/ExperImages/n210277.jpg b/data/ExperImages/n210277.jpg new file mode 100644 index 0000000..a5d91cc Binary files /dev/null and b/data/ExperImages/n210277.jpg differ diff --git a/data/ExperImages/n214414.jpg b/data/ExperImages/n214414.jpg new file mode 100644 index 0000000..027767f Binary files /dev/null and b/data/ExperImages/n214414.jpg differ diff --git a/data/ExperImages/n215517.jpg b/data/ExperImages/n215517.jpg new file mode 100644 index 0000000..e0e67bc Binary files /dev/null and b/data/ExperImages/n215517.jpg differ diff --git a/data/ExperImages/n229656.jpg 
b/data/ExperImages/n229656.jpg new file mode 100644 index 0000000..01fa258 Binary files /dev/null and b/data/ExperImages/n229656.jpg differ diff --git a/data/ExperImages/n239383.jpg b/data/ExperImages/n239383.jpg new file mode 100644 index 0000000..3c147a1 Binary files /dev/null and b/data/ExperImages/n239383.jpg differ diff --git a/data/ExperImages/n24913.jpg b/data/ExperImages/n24913.jpg new file mode 100644 index 0000000..9b6c98e Binary files /dev/null and b/data/ExperImages/n24913.jpg differ diff --git a/data/ExperImages/n253231.jpg b/data/ExperImages/n253231.jpg new file mode 100644 index 0000000..3c31c2c Binary files /dev/null and b/data/ExperImages/n253231.jpg differ diff --git a/data/ExperImages/n258003.jpg b/data/ExperImages/n258003.jpg new file mode 100644 index 0000000..82b34f0 Binary files /dev/null and b/data/ExperImages/n258003.jpg differ diff --git a/data/ExperImages/n261100.jpg b/data/ExperImages/n261100.jpg new file mode 100644 index 0000000..c43ab19 Binary files /dev/null and b/data/ExperImages/n261100.jpg differ diff --git a/data/ExperImages/n264509.jpg b/data/ExperImages/n264509.jpg new file mode 100644 index 0000000..e734049 Binary files /dev/null and b/data/ExperImages/n264509.jpg differ diff --git a/data/ExperImages/n266971.jpg b/data/ExperImages/n266971.jpg new file mode 100644 index 0000000..e170f05 Binary files /dev/null and b/data/ExperImages/n266971.jpg differ diff --git a/data/ExperImages/n267826.jpg b/data/ExperImages/n267826.jpg new file mode 100644 index 0000000..64e7161 Binary files /dev/null and b/data/ExperImages/n267826.jpg differ diff --git a/data/ExperImages/n274318.jpg b/data/ExperImages/n274318.jpg new file mode 100644 index 0000000..3dd7a7f Binary files /dev/null and b/data/ExperImages/n274318.jpg differ diff --git a/data/ExperImages/n275523.jpg b/data/ExperImages/n275523.jpg new file mode 100644 index 0000000..0907916 Binary files /dev/null and b/data/ExperImages/n275523.jpg differ diff --git a/data/ExperImages/n287162.jpg b/data/ExperImages/n287162.jpg new file mode 100644 index 0000000..4a2c104 Binary files /dev/null and b/data/ExperImages/n287162.jpg differ diff --git a/data/ExperImages/n298165.jpg b/data/ExperImages/n298165.jpg new file mode 100644 index 0000000..68e53d4 Binary files /dev/null and b/data/ExperImages/n298165.jpg differ diff --git a/data/ExperImages/n299577.jpg b/data/ExperImages/n299577.jpg new file mode 100644 index 0000000..6844b44 Binary files /dev/null and b/data/ExperImages/n299577.jpg differ diff --git a/data/ExperImages/n307753.jpg b/data/ExperImages/n307753.jpg new file mode 100644 index 0000000..6ffd362 Binary files /dev/null and b/data/ExperImages/n307753.jpg differ diff --git a/data/ExperImages/n311711.jpg b/data/ExperImages/n311711.jpg new file mode 100644 index 0000000..3d01a7c Binary files /dev/null and b/data/ExperImages/n311711.jpg differ diff --git a/data/ExperImages/n312992.jpg b/data/ExperImages/n312992.jpg new file mode 100644 index 0000000..45fa9dc Binary files /dev/null and b/data/ExperImages/n312992.jpg differ diff --git a/data/ExperImages/n318563.jpg b/data/ExperImages/n318563.jpg new file mode 100644 index 0000000..757b186 Binary files /dev/null and b/data/ExperImages/n318563.jpg differ diff --git a/data/ExperImages/n324512.jpg b/data/ExperImages/n324512.jpg new file mode 100644 index 0000000..8fc1d1e Binary files /dev/null and b/data/ExperImages/n324512.jpg differ diff --git a/data/ExperImages/n339300.jpg b/data/ExperImages/n339300.jpg new file mode 100644 index 0000000..2ad9807 Binary files /dev/null 
and b/data/ExperImages/n339300.jpg differ diff --git a/data/ExperImages/n339728.jpg b/data/ExperImages/n339728.jpg new file mode 100644 index 0000000..8d04212 Binary files /dev/null and b/data/ExperImages/n339728.jpg differ diff --git a/data/ExperImages/n341278.jpg b/data/ExperImages/n341278.jpg new file mode 100644 index 0000000..16b119b Binary files /dev/null and b/data/ExperImages/n341278.jpg differ diff --git a/data/ExperImages/n343034.jpg b/data/ExperImages/n343034.jpg new file mode 100644 index 0000000..264d7cd Binary files /dev/null and b/data/ExperImages/n343034.jpg differ diff --git a/data/ExperImages/n345315.jpg b/data/ExperImages/n345315.jpg new file mode 100644 index 0000000..cd90c7b Binary files /dev/null and b/data/ExperImages/n345315.jpg differ diff --git a/data/ExperImages/n345363.jpg b/data/ExperImages/n345363.jpg new file mode 100644 index 0000000..1dc750b Binary files /dev/null and b/data/ExperImages/n345363.jpg differ diff --git a/data/ExperImages/n349224.jpg b/data/ExperImages/n349224.jpg new file mode 100644 index 0000000..493ad91 Binary files /dev/null and b/data/ExperImages/n349224.jpg differ diff --git a/data/ExperImages/n366949.jpg b/data/ExperImages/n366949.jpg new file mode 100644 index 0000000..5574ba2 Binary files /dev/null and b/data/ExperImages/n366949.jpg differ diff --git a/data/ExperImages/n383044.jpg b/data/ExperImages/n383044.jpg new file mode 100644 index 0000000..78d999c Binary files /dev/null and b/data/ExperImages/n383044.jpg differ diff --git a/data/ExperImages/n394813.jpg b/data/ExperImages/n394813.jpg new file mode 100644 index 0000000..76fdb55 Binary files /dev/null and b/data/ExperImages/n394813.jpg differ diff --git a/data/ExperImages/n406179.jpg b/data/ExperImages/n406179.jpg new file mode 100644 index 0000000..fd8003a Binary files /dev/null and b/data/ExperImages/n406179.jpg differ diff --git a/data/ExperImages/n408516.jpg b/data/ExperImages/n408516.jpg new file mode 100644 index 0000000..5ad0d9f Binary files /dev/null and b/data/ExperImages/n408516.jpg differ diff --git a/data/ExperImages/n409008.jpg b/data/ExperImages/n409008.jpg new file mode 100644 index 0000000..ed994af Binary files /dev/null and b/data/ExperImages/n409008.jpg differ diff --git a/data/ExperImages/n41686.jpg b/data/ExperImages/n41686.jpg new file mode 100644 index 0000000..24e529e Binary files /dev/null and b/data/ExperImages/n41686.jpg differ diff --git a/data/ExperImages/n424704.jpg b/data/ExperImages/n424704.jpg new file mode 100644 index 0000000..6dd7044 Binary files /dev/null and b/data/ExperImages/n424704.jpg differ diff --git a/data/ExperImages/n437038.jpg b/data/ExperImages/n437038.jpg new file mode 100644 index 0000000..fc327b0 Binary files /dev/null and b/data/ExperImages/n437038.jpg differ diff --git a/data/ExperImages/n464936.jpg b/data/ExperImages/n464936.jpg new file mode 100644 index 0000000..677b950 Binary files /dev/null and b/data/ExperImages/n464936.jpg differ diff --git a/data/ExperImages/n466504.jpg b/data/ExperImages/n466504.jpg new file mode 100644 index 0000000..6a877bb Binary files /dev/null and b/data/ExperImages/n466504.jpg differ diff --git a/data/ExperImages/n468138.jpg b/data/ExperImages/n468138.jpg new file mode 100644 index 0000000..2396fcd Binary files /dev/null and b/data/ExperImages/n468138.jpg differ diff --git a/data/ExperImages/n474949.jpg b/data/ExperImages/n474949.jpg new file mode 100644 index 0000000..3c7c8e3 Binary files /dev/null and b/data/ExperImages/n474949.jpg differ diff --git a/data/ExperImages/n475122.jpg 
b/data/ExperImages/n475122.jpg new file mode 100644 index 0000000..d438334 Binary files /dev/null and b/data/ExperImages/n475122.jpg differ diff --git a/data/ExperImages/n479684.jpg b/data/ExperImages/n479684.jpg new file mode 100644 index 0000000..b8cc80e Binary files /dev/null and b/data/ExperImages/n479684.jpg differ diff --git a/data/ExperImages/n487547.jpg b/data/ExperImages/n487547.jpg new file mode 100644 index 0000000..ce59d37 Binary files /dev/null and b/data/ExperImages/n487547.jpg differ diff --git a/data/ExperImages/n488098.jpg b/data/ExperImages/n488098.jpg new file mode 100644 index 0000000..5ad32a3 Binary files /dev/null and b/data/ExperImages/n488098.jpg differ diff --git a/data/ExperImages/n49911.jpg b/data/ExperImages/n49911.jpg new file mode 100644 index 0000000..213dc3f Binary files /dev/null and b/data/ExperImages/n49911.jpg differ diff --git a/data/ExperImages/n511793.jpg b/data/ExperImages/n511793.jpg new file mode 100644 index 0000000..08afe81 Binary files /dev/null and b/data/ExperImages/n511793.jpg differ diff --git a/data/ExperImages/n51303.jpg b/data/ExperImages/n51303.jpg new file mode 100644 index 0000000..f94f76b Binary files /dev/null and b/data/ExperImages/n51303.jpg differ diff --git a/data/ExperImages/n513747.jpg b/data/ExperImages/n513747.jpg new file mode 100644 index 0000000..2f3001a Binary files /dev/null and b/data/ExperImages/n513747.jpg differ diff --git a/data/ExperImages/n514077.jpg b/data/ExperImages/n514077.jpg new file mode 100644 index 0000000..78d4fba Binary files /dev/null and b/data/ExperImages/n514077.jpg differ diff --git a/data/ExperImages/n515157.jpg b/data/ExperImages/n515157.jpg new file mode 100644 index 0000000..049813a Binary files /dev/null and b/data/ExperImages/n515157.jpg differ diff --git a/data/ExperImages/n524673.jpg b/data/ExperImages/n524673.jpg new file mode 100644 index 0000000..76187e3 Binary files /dev/null and b/data/ExperImages/n524673.jpg differ diff --git a/data/ExperImages/n525013.jpg b/data/ExperImages/n525013.jpg new file mode 100644 index 0000000..0a1555b Binary files /dev/null and b/data/ExperImages/n525013.jpg differ diff --git a/data/ExperImages/n536090.jpg b/data/ExperImages/n536090.jpg new file mode 100644 index 0000000..42fae4d Binary files /dev/null and b/data/ExperImages/n536090.jpg differ diff --git a/data/ExperImages/n537813.jpg b/data/ExperImages/n537813.jpg new file mode 100644 index 0000000..d11b18a Binary files /dev/null and b/data/ExperImages/n537813.jpg differ diff --git a/data/ExperImages/n544799.jpg b/data/ExperImages/n544799.jpg new file mode 100644 index 0000000..3a92f44 Binary files /dev/null and b/data/ExperImages/n544799.jpg differ diff --git a/data/ExperImages/n550668.jpg b/data/ExperImages/n550668.jpg new file mode 100644 index 0000000..5b9d4a4 Binary files /dev/null and b/data/ExperImages/n550668.jpg differ diff --git a/data/ExperImages/n553018.jpg b/data/ExperImages/n553018.jpg new file mode 100644 index 0000000..999fba3 Binary files /dev/null and b/data/ExperImages/n553018.jpg differ diff --git a/data/ExperImages/n554025.jpg b/data/ExperImages/n554025.jpg new file mode 100644 index 0000000..c91c4c9 Binary files /dev/null and b/data/ExperImages/n554025.jpg differ diff --git a/data/ExperImages/n557683.jpg b/data/ExperImages/n557683.jpg new file mode 100644 index 0000000..83581d2 Binary files /dev/null and b/data/ExperImages/n557683.jpg differ diff --git a/data/ExperImages/n560895.jpg b/data/ExperImages/n560895.jpg new file mode 100644 index 0000000..4576ba5 Binary files /dev/null and 
b/data/ExperImages/n560895.jpg differ diff --git a/data/ExperImages/n56556.jpg b/data/ExperImages/n56556.jpg new file mode 100644 index 0000000..b526630 Binary files /dev/null and b/data/ExperImages/n56556.jpg differ diff --git a/data/ExperImages/n565573.jpg b/data/ExperImages/n565573.jpg new file mode 100644 index 0000000..b3922c2 Binary files /dev/null and b/data/ExperImages/n565573.jpg differ diff --git a/data/ExperImages/n568258.jpg b/data/ExperImages/n568258.jpg new file mode 100644 index 0000000..b9c1330 Binary files /dev/null and b/data/ExperImages/n568258.jpg differ diff --git a/data/ExperImages/n59657.jpg b/data/ExperImages/n59657.jpg new file mode 100644 index 0000000..9e98dd6 Binary files /dev/null and b/data/ExperImages/n59657.jpg differ diff --git a/data/ExperImages/n59853.jpg b/data/ExperImages/n59853.jpg new file mode 100644 index 0000000..2cf18cb Binary files /dev/null and b/data/ExperImages/n59853.jpg differ diff --git a/data/ExperImages/n60178.jpg b/data/ExperImages/n60178.jpg new file mode 100644 index 0000000..5ee3f22 Binary files /dev/null and b/data/ExperImages/n60178.jpg differ diff --git a/data/ExperImages/n61019.jpg b/data/ExperImages/n61019.jpg new file mode 100644 index 0000000..5d026f8 Binary files /dev/null and b/data/ExperImages/n61019.jpg differ diff --git a/data/ExperImages/n66626.jpg b/data/ExperImages/n66626.jpg new file mode 100644 index 0000000..c046837 Binary files /dev/null and b/data/ExperImages/n66626.jpg differ diff --git a/data/ExperImages/n93063.jpg b/data/ExperImages/n93063.jpg new file mode 100644 index 0000000..f1d7c68 Binary files /dev/null and b/data/ExperImages/n93063.jpg differ diff --git a/data/problem.xlsx b/data/problem.xlsx new file mode 100644 index 0000000..f1e7cf4 Binary files /dev/null and b/data/problem.xlsx differ diff --git a/data/test.csv b/data/test.csv new file mode 100644 index 0000000..3662a71 --- /dev/null +++ b/data/test.csv @@ -0,0 +1,101 @@ +isBalanced,question,imageId +True,Which kind of furniture is below the decoration?,n274318 +True,Which kind of clothing is warm?,n115871 +True,Which kind of clothing is not black?,n184739 +True,Do the pants that are not dirty look large?,n66626 +True,The lamp that is not turned-off is sitting on top of what?,n210059 +True,What kind of furniture is made of metal?,n188669 +True,Is the shirt orange or blue?,n554025 +True,Do you see either any white towels or pillows?,n468138 +True,Is the closed drawer to the right of a chair?,n15719 +True,What is the lamp made of?,n133975 +True,What does the man wear?,n51303 +True,What is under the cooking utensil made of wood?,n349224 +True,Are the end table and the desk made of the same material?,n194711 +True,Is the counter above a drawer?,n229656 +True,Are the pants black and long?,n149859 +True,What is the cup to the left of the keyboard made of?,n475122 +True,Is the standing woman behind the tomatoes wearing a hat?,n299577 +True,Where is the man in front of the fence standing on?,n464936 +True,What appliance is not used?,n409008 +True,Do you see any women inside the library?,n93063 +True,How large is the device the computer monitor is beside of?,n479684 +True,The bun is on what?,n24913 +True,What is the boy doing?,n49911 +True,Is there a bed or a desk in this picture?,n170047 +True,Does the gate look metallic and tall?,n196089 +True,Does the mirror look clean and brown?,n266971 +True,What is in front of the street light?,n511793 +True,Is the helmet on the right?,n339728 +True,Are there white desks in the picture?,n345363 +True,Is the house both 
white and small?,n109961 +True,Is the bag behind a chair?,n214414 +True,Is the garbage can to the left of the people?,n147001 +True,Are the benches in front of the person hard and red?,n324512 +True,Is there a train above the street that is made of brick?,n253231 +True,The tomatoes are in what?,n557683 +True,Is the plate on a counter?,n318563 +True,Are there any women in the photo that are not riding?,n59657 +True,What is the piece of furniture that is hanging above the wall called?,n61019 +True,Is the plastic bucket to the right or to the left of the toilet that is white?,n171693 +True,What is the man wearing?,n275523 +True,Is there a blue window in this picture?,n189986 +True,Does the dress look short sleeved?,n515157 +True,How large is the pitcher that the ketchup is in front of?,n158542 +True,What is in front of the trees?,n139491 +True,Does the person that is not old wear a hat?,n560895 +True,Is the happy person on the left or on the right side of the image?,n383044 +True,Is the empty bottle on the right side or on the left?,n60178 +True,Which kind of clothing is light colored?,n111074 +True,Do you see tables in the photo?,n154501 +True,What animal is standing on the small boat?,n178815 +True,What is the plate near the candle holder sitting atop?,n264509 +True,Which place is it?,n424704 +True,What is the device in front of the person that is sitting on the ground called?,n20290 +True,Do you see end tables next to the sofa on the left part of the photo?,n307753 +True,What is under the windows?,n341278 +True,Does that sweatshirt have striped pattern and gray color?,n345315 +True,What is the color of the toilet paper?,n287162 +True,What's the man doing?,n513747 +True,Which device is on?,n59853 +True,Are there black sandals or boots?,n41686 +True,Is the plastic cup to the left of the other cup small and colorful?,n215517 +True,What kind of furniture are the drawers in?,n133975 +True,Are the men to the left of a bowl?,n298165 +True,What animal is brown?,n406179 +True,What is the color of the wine glasses?,n544799 +True,Are there both cars and fences in the photo?,n130253 +True,Are there either any plates or breads in the image?,n267826 +True,What is on the motorcycle?,n339300 +True,How large are the rocks?,n408516 +True,Which kind of food is bigger than the blueberry?,n466504 +True,What is the woman wearing?,n537813 +True,What does the man wear?,n366949 +True,"Which side of the picture are the bags on, the left or the right?",n568258 +True,Where is the snowboarder standing on?,n553018 +True,How is this vehicle called?,n487547 +True,Who is holding onto the umbrella?,n103484 +True,What is the toilet brush next to the toilet made of?,n514077 +True,Who in this image is talking?,n343034 +True,What type of fast food is the woman that is not old looking at?,n394813 +True,Are the daughter that is not old and the daughter to the right of the dad both happy?,n155297 +True,Are the people that are talking leaning on the wood fence?,n181615 +True,Is the clock both round and gold?,n239383 +True,Is the girl to the left of the backpack waiting or playing?,n437038 +True,Is the table gray?,n56556 +True,What is the color of the bread that is not little?,n312992 +True,What is in front of the trees?,n536090 +True,Is that street sign black and small?,n311711 +True,Is that fork made of stainless steel?,n474949 +True,What is the weather like in the picture?,n130371 +True,Is there a mug on top of the bench?,n261100 +True,Is the sand both black and wet?,n175869 +True,What is the aircraft that the woman is looking 
at?,n524673 +True,What pieces of furniture are to the left of the car?,n173361 +True,Who in this photograph is looking down?,n488098 +True,Is the sink short or tall?,n182120 +True,What is that fence in front of?,n258003 +True,Are there any chairs near the decorative painting?,n210277 +True,How clean is the headband that the girl is wearing?,n525013 +True,Is the small building behind or in front of the bushy tree?,n550668 +True,Does the umbrella look female and white?,n565573 diff --git a/data/visualization.xlsx b/data/visualization.xlsx new file mode 100644 index 0000000..97de500 Binary files /dev/null and b/data/visualization.xlsx differ diff --git a/download_GLIP.sh b/download_GLIP.sh new file mode 100644 index 0000000..6e78006 --- /dev/null +++ b/download_GLIP.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +# change this to your preferred download location +PRETRAINED_MODELS_PATH=./pretrained_models + +# GLIP model +mkdir -p $PRETRAINED_MODELS_PATH/GLIP/checkpoints +mkdir -p $PRETRAINED_MODELS_PATH/GLIP/configs +wget -nc -P $PRETRAINED_MODELS_PATH/GLIP/checkpoints https://huggingface.co/GLIPModel/GLIP/resolve/main/glip_large_model.pth +wget -nc -P $PRETRAINED_MODELS_PATH/GLIP/configs https://raw.githubusercontent.com/microsoft/GLIP/main/configs/pretrain/glip_Swin_L.yaml diff --git a/engine/object_graph.py b/engine/object_graph.py new file mode 100644 index 0000000..ed7c593 --- /dev/null +++ b/engine/object_graph.py @@ -0,0 +1,39 @@ +class ObjectGraph: + + def __init__(self, Name: str, Box, + Category : str = "object", + Location : tuple = (0, 0), + Size : tuple = (0, 0)) -> None: + + self.Attribute = dict( + Name = Name, + Box = Box, + Category = Category, + Location = Location, + Size = Size + ) + + def add(self, key: str, value: any) -> None: + + self.Attribute[key] = value + +class ObjectGraphGroup: + + def __init__(self, img=None, groupA=None, groupB=None): + if img is not None: + self.Graphs = [] + self.Relations = dict() + self.Img = img + elif groupA is not None and groupB is not None: + self.Graphs = groupA.Graphs + groupB.Graphs + self.Relations = groupA.Relations | groupB.Relations + self.Img = groupA.Img + else: + raise ValueError("You must give a image or give two groups.") + + def add_graph(self, graph: ObjectGraph): + + self.Graphs.append(graph) + + def add_relation(self, objA, objB, relation): + self.Relations[(objA, objB)] = relation diff --git a/engine/step_interpreters.py b/engine/step_interpreters.py index c588497..3f0584a 100644 --- a/engine/step_interpreters.py +++ b/engine/step_interpreters.py @@ -1,7 +1,10 @@ import cv2 import os import torch -import openai +import sys +import timeit +from typing import Union +from openai import OpenAI import functools import numpy as np import face_detection @@ -11,9 +14,14 @@ from PIL import Image,ImageDraw,ImageFont,ImageFilter from transformers import (ViltProcessor, ViltForQuestionAnswering, OwlViTProcessor, OwlViTForObjectDetection, + AutoModelForZeroShotObjectDetection, DetrImageProcessor, DetrForObjectDetection, MaskFormerFeatureExtractor, MaskFormerForInstanceSegmentation, CLIPProcessor, CLIPModel, AutoProcessor, BlipForQuestionAnswering) from diffusers import StableDiffusionInpaintPipeline +from .object_graph import ObjectGraph, ObjectGraphGroup +import re +from maskrcnn_benchmark.engine.predictor_glip import GLIPDemo, to_image_list, create_positive_map, \ + create_positive_map_label_to_token_from_positive_map from .nms import nms from vis_utils import html_embed_image, html_colored_span, vis_masks @@ -191,18 +199,483 @@ def 
execute(self,prog_step,inspect=False): return answer +class BuildInterpreter(): + step_name = "BUILD" + + def __init__(self): + self.vqa = VQAInterpreter() + + self.crop = CropInterpreter() + + def parse(self,prog_step): + parse_result = parse_step(prog_step.prog_str) + step_name = parse_result['step_name'] + objs_var = parse_result['args']['objects'] + output_var = parse_result['output_var'] + assert(step_name==self.step_name) + return objs_var,output_var + + def execute(self, prog_step,inspect=False): + objs_var, output_var = self.parse(prog_step) + + objs = prog_step.state[objs_var] + img = objs["img"] + category = objs["category"] + + graphs = ObjectGraphGroup(img) + + for obj in objs["box"]: + cropped_img = self.crop.cropbox(obj, img) + + answer = self.vqa.predict(cropped_img, f"What's this {category}?") + + mid_w, mid_h = (obj[0] + obj[2]) / 2, (obj[1] + obj[3]) / 2 + w, h = img.size + location = (mid_w / w, mid_h / h) + size = (obj[2] - obj[0], obj[3] - obj[1]) + + graphs.add_graph(ObjectGraph(answer, obj, category, location, size)) + + print(graphs.Graphs) + prog_step.state[output_var] = graphs + +class ADDInterpreter(): + step_name = 'ADD' + + def __init__(self): + self.vqa = VQAInterpreter() + + def parse(self,prog_step): + parse_result = parse_step(prog_step.prog_str) + step_name = parse_result['step_name'] + graph_var = parse_result['args']['graph'] + attribute_str = parse_result['args']['attribute'] + output_var = parse_result['output_var'] + assert(step_name==self.step_name) + return graph_var,attribute_str,output_var + + def execute(self, prog_step,inspect=False): + graph_var, attribute_str, output_var = self.parse(prog_step) + + graph = prog_step.state[graph_var] + img = graph.Img + + res_graph = ObjectGraphGroup(img=img) + + for obj in graph.Graphs: + if attribute_str in obj.Attribute: + pass + else: + cropped_img = img.crop(obj.Attribute["Box"]) + obj_name = obj.Attribute["Name"] + answer = self.vqa.predict(cropped_img, f"What's the {attribute_str} of this {obj_name}?") + + obj.add(attribute_str, answer) + + res_graph.add_graph(obj) + + prog_step.state[output_var] = res_graph + + return res_graph + +class MERGEInterpreter(): + step_name = 'MERGE' + + RELATION_MESSAGE = [ + { + "role": "system", + "content": """Given the relationship (subject, object): relationship, generate a question that asks about this relationship. + +For example: +**Input**: (bottles, wine): right_of +**Output**: "Is the bottles to the right of the wine?" + +**Input**: (book, table): under +**Output**: "Is the book under the table?" + +Please follow this format to create the questions. +""" + }, + { + "role": "user", + "content": """**Input:** (people, umbrella): carry""" + }, + { + "role": "assistant", + "content": """**Output:** "Is the people carrying the umbrella?" """ + }, + { + "role": "user", + "content": """**Input:** (book, table): under""" + }, + { + "role": "assistant", + "content": """**Output:** "Is the book under the table?"
""" + } + ] + + def __init__(self): + self.vqa = VQAInterpreter() + self.client = OpenAI() + + def parse(self,prog_step): + parse_result = parse_step(prog_step.prog_str) + step_name = parse_result['step_name'] + graph1_var = parse_result['args']['graphA'] + graph2_var = parse_result['args']['graphB'] + relation_str = parse_result['args']['relation'] + output_var = parse_result['output_var'] + assert(step_name==self.step_name) + return graph1_var,graph2_var,relation_str,output_var + + def llm_ask(self, relation): + message = self.RELATION_MESSAGE + message.append({"role": "user", "content": f"**Input:** {relation}"}) + + response = self.client.chat.completions.create( + model="gpt-4o", + temperature=0.8, + messages=message + ) + + answer = response.choices[0].message.content + + matches = re.findall(r'"([^}]*)"', answer) + + if len(matches) > 0: + return matches[0] + + return "" + + def get_categories(self, group): + categories = [] + + for graph in group.Graphs: + if graph.Attribute["Category"] not in categories: + categories.append(graph.Attribute["Category"]) + + return categories + + def focus_image(self, img, box1, box2): + black_image = Image.new('RGB', img.size, (0, 0, 0)) + + region1 = img.crop(box1) + black_image.paste(region1, (box1[0], box1[1])) + + region2 = img.crop(box2) + black_image.paste(region2, (box2[0], box2[1])) + + return black_image + + def execute(self, prog_step,inspect=False): + graph1_var, graph2_var, relation_str, output_var = self.parse(prog_step) + + graph1 = prog_step.state[graph1_var] + graph2 = prog_step.state[graph2_var] + + if(relation_str == "None"): + + merged_graph = ObjectGraphGroup(groupA=graph1, groupB=graph2) + + else: + merged_graph = ObjectGraphGroup(groupA=graph1, groupB=graph2) + img = merged_graph.Img + + for objA in graph1.Graphs: + for objB in graph2.Graphs: + new_img = self.focus_image(img, objA.Attribute["Box"], objB.Attribute["Box"]) + new_img.save("test.jpg") + + Aname = objA.Attribute["Name"] + Bname = objB.Attribute["Name"] + + question = self.llm_ask(f"({Aname}, {Bname}): {relation_str}") + print(question) + + answer = self.vqa.predict(new_img, question) + + if(answer == "yes"): + merged_graph.add_relation(objA, objB, relation_str) + + prog_step.state[output_var] = merged_graph + + return merged_graph + +class HiddenPrints: + hide_prints = False + + def __init__(self, model_name=None, console=None, use_newline=True): + self.model_name = model_name + self.console = console + self.use_newline = use_newline + self.tqdm_aux = None + + def __enter__(self): + if self.hide_prints: + import tqdm # We need to do an extra step to hide tqdm outputs. Does not work in Jupyter Notebooks. 
+ + def nop(it, *a, **k): + return it + + self.tqdm_aux = tqdm.tqdm + tqdm.tqdm = nop + + if self.model_name is not None: + self.console.print(f'Loading {self.model_name}...') + self._original_stdout = sys.stdout + self._original_stderr = sys.stderr + sys.stdout = open(os.devnull, 'w') + # May not be what we always want, but some annoying warnings end up to stderr + sys.stderr = open(os.devnull, 'w') + + def __exit__(self, exc_type, exc_val, exc_tb): + if self.hide_prints: + sys.stdout.close() + sys.stdout = self._original_stdout + sys.stderr = self._original_stderr + if self.model_name is not None: + self.console.print(f'{self.model_name} loaded ') + import tqdm + tqdm.tqdm = self.tqdm_aux + +class GLIPLocInterpreter(GLIPDemo): + step_name = 'LOC' + + def __init__(self, *args_demo): + + working_dir = f'./pretrained_models/GLIP/' + + config_file = working_dir + "configs/glip_Swin_L.yaml" + weight_file = working_dir + "checkpoints/glip_large_model.pth" + + kwargs = { + 'min_image_size': 800, + 'confidence_threshold': 0.5, + 'show_mask_heatmaps': False + } + + self.dev = "cuda" if torch.cuda.is_available() else "cpu" + + from maskrcnn_benchmark.config import cfg + + # manual override some options + cfg.local_rank = 0 + cfg.num_gpus = 1 + cfg.merge_from_file(config_file) + cfg.merge_from_list(["MODEL.WEIGHT", weight_file]) + cfg.merge_from_list(["MODEL.DEVICE", self.dev]) + + with HiddenPrints("GLIP"), torch.cuda.device(self.dev): + from transformers.utils import logging + logging.set_verbosity_error() + GLIPDemo.__init__(self, cfg, *args_demo, **kwargs) + if self.cfg.MODEL.RPN_ARCHITECTURE == "VLDYHEAD": + plus = 1 + else: + plus = 0 + self.plus = plus + self.color = 255 + + @torch.no_grad() + def compute_prediction(self, original_image, original_caption, custom_entity=None): + image = self.transforms(original_image) + # image = [image, image.permute(0, 2, 1)] + image_list = to_image_list(image, self.cfg.DATALOADER.SIZE_DIVISIBILITY) + image_list = image_list.to(self.dev) + # caption + if isinstance(original_caption, list): + + if len(original_caption) > 40: + all_predictions = None + for loop_num, i in enumerate(range(0, len(original_caption), 40)): + list_step = original_caption[i:i + 40] + prediction_step = self.compute_prediction(original_image, list_step, custom_entity=None) + if all_predictions is None: + all_predictions = prediction_step + else: + # Aggregate predictions + all_predictions.bbox = torch.cat((all_predictions.bbox, prediction_step.bbox), dim=0) + for k in all_predictions.extra_fields: + all_predictions.extra_fields[k] = \ + torch.cat((all_predictions.extra_fields[k], + prediction_step.extra_fields[k] + loop_num), dim=0) + return all_predictions + + # we directly provided a list of category names + caption_string = "" + tokens_positive = [] + seperation_tokens = " . 
" + for word in original_caption: + tokens_positive.append([len(caption_string), len(caption_string) + len(word)]) + caption_string += word + caption_string += seperation_tokens + + tokenized = self.tokenizer([caption_string], return_tensors="pt") + # tokens_positive = [tokens_positive] # This was wrong + tokens_positive = [[v] for v in tokens_positive] + + original_caption = caption_string + # print(tokens_positive) + else: + tokenized = self.tokenizer([original_caption], return_tensors="pt") + if custom_entity is None: + tokens_positive = self.run_ner(original_caption) + # print(tokens_positive) + # process positive map + positive_map = create_positive_map(tokenized, tokens_positive) + + positive_map_label_to_token = create_positive_map_label_to_token_from_positive_map(positive_map, + plus=self.plus) + self.positive_map_label_to_token = positive_map_label_to_token + tic = timeit.time.perf_counter() + + # compute predictions + with HiddenPrints(): # Hide some deprecated notices + predictions = self.model(image_list, captions=[original_caption], + positive_map=positive_map_label_to_token) + predictions = [o.to(self.cpu_device) for o in predictions] + # print("inference time per image: {}".format(timeit.time.perf_counter() - tic)) + + # always single image is passed at a time + prediction = predictions[0] + + # reshape prediction (a BoxList) into the original image size + height, width = original_image.shape[-2:] + # if self.tensor_inputs: + # else: + # height, width = original_image.shape[:-1] + prediction = prediction.resize((width, height)) + + if prediction.has_field("mask"): + # if we have masks, paste the masks in the right position + # in the image, as defined by the bounding boxes + masks = prediction.get_field("mask") + # always single image is passed at a time + masks = self.masker([masks], [prediction])[0] + prediction.add_field("mask", masks) + + return prediction + + @staticmethod + def to_left_right_upper_lower(bboxes): + return [(bbox[1], bbox[3], bbox[0], bbox[2]) for bbox in bboxes] + + @staticmethod + def to_xmin_ymin_xmax_ymax(bboxes): + # invert the previous method + return [(bbox[2], bbox[0], bbox[3], bbox[1]) for bbox in bboxes] + + @staticmethod + def prepare_image(image): + image = image[[2, 1, 0]] # convert to bgr for opencv-format for glip + return image + + @torch.no_grad() + def forward(self, image: torch.Tensor, obj: Union[str, list], return_labels: bool = False, + confidence_threshold=None): + + if confidence_threshold is not None: + original_confidence_threshold = self.confidence_threshold + self.confidence_threshold = confidence_threshold + + # if isinstance(object, list): + # object = ' . '.join(object) + ' .' 
# add separation tokens + image = self.prepare_image(image) + + # Avoid the resizing creating a huge image in a pathological case + ratio = image.shape[1] / image.shape[2] + ratio = max(ratio, 1 / ratio) + original_min_image_size = self.min_image_size + if ratio > 10: + self.min_image_size = int(original_min_image_size * 10 / ratio) + self.transforms = self.build_transform() + + with torch.cuda.device(self.dev): + inference_output = self.inference(image, obj) + + bboxes = inference_output.bbox.cpu().numpy().astype(int) + # bboxes = self.to_left_right_upper_lower(bboxes) + + if ratio > 10: + self.min_image_size = original_min_image_size + self.transforms = self.build_transform() + + bboxes = torch.tensor(bboxes) + + # Convert to [left, lower, right, upper] instead of [left, upper, right, lower] + height = image.shape[-2] + bboxes = torch.stack([bboxes[:, 0], height - bboxes[:, 3], bboxes[:, 2], height - bboxes[:, 1]], dim=1) + + if confidence_threshold is not None: + self.confidence_threshold = original_confidence_threshold + if return_labels: + # subtract 1 because it's 1-indexed for some reason + return bboxes, inference_output.get_field("labels").cpu().numpy() - 1 + return bboxes + + def box_image(self,img,boxes,highlight_best=True): + img1 = img.copy() + draw = ImageDraw.Draw(img1) + for i,box in enumerate(boxes): + if i==0 and highlight_best: + color = 'red' + else: + color = 'blue' + + draw.rectangle(box,outline=color,width=5) + + return img1 + + def parse(self,prog_step): + parse_result = parse_step(prog_step.prog_str) + step_name = parse_result['step_name'] + img_var = parse_result['args']['image'] + obj_name = eval(parse_result['args']['object']) + output_var = parse_result['output_var'] + assert(step_name==self.step_name) + return img_var,obj_name,output_var + + def execute(self,prog_step,inspect=False): + img_var,obj_name,output_var = self.parse(prog_step) + img = prog_step.state[img_var] + + bboxes = self.forward(img,obj_name) + + box_img = self.box_image(img, bboxes) + + objs = dict( + box = bboxes, + category = obj_name, + img = img + ) + + prog_step.state[output_var] = objs + prog_step.state[output_var+'_IMAGE'] = box_img + if inspect: + html_str = self.html(img, box_img, output_var, obj_name) + return objs, html_str + + return objs class LocInterpreter(): step_name = 'LOC' - def __init__(self,thresh=0.1,nms_thresh=0.5): + def __init__(self, thresh=0.1,nms_thresh=0.5): print(f'Registering {self.step_name} step') - self.device = "cuda:0" if torch.cuda.is_available() else "cpu" - self.processor = OwlViTProcessor.from_pretrained( - "google/owlvit-large-patch14") - self.model = OwlViTForObjectDetection.from_pretrained( - "google/owlvit-large-patch14").to(self.device) - self.model.eval() + + model_id = "IDEA-Research/grounding-dino-base" + self.device = "cuda" if torch.cuda.is_available() else "cpu" + + self.processor = AutoProcessor.from_pretrained(model_id) + self.model = AutoModelForZeroShotObjectDetection.from_pretrained( + model_id).to(self.device) + + special_model_id = "facebook/detr-resnet-50" + self.special_processor = DetrImageProcessor.from_pretrained( + special_model_id, revision="no_timm") + self.special_model = DetrForObjectDetection.from_pretrained( + special_model_id, revision="no_timm") + self.thresh = thresh self.nms_thresh = nms_thresh @@ -225,19 +698,22 @@ def normalize_coord(self,bbox,img_size): return [x1,y1,x2,y2] def predict(self,img,obj_name): - encoding = self.processor( - text=[[f'a photo of {obj_name}']], - images=img, - return_tensors='pt') - encoding 
= {k:v.to(self.device) for k,v in encoding.items()} + prompt = f"a {obj_name}." + + inputs = self.processor( + images=img, text=prompt, return_tensors="pt").to(self.device) + with torch.no_grad(): - outputs = self.model(**encoding) - for k,v in outputs.items(): - if v is not None: - outputs[k] = v.to('cpu') if isinstance(v, torch.Tensor) else v + outputs = self.model(**inputs) + + results = self.processor.post_process_grounded_object_detection( + outputs, + inputs.input_ids, + box_threshold=0.4, + text_threshold=0.3, + target_sizes=[img.size[::-1]] + ) - target_sizes = torch.Tensor([img.size[::-1]]) - results = self.processor.post_process_object_detection(outputs=outputs,threshold=self.thresh,target_sizes=target_sizes) boxes, scores = results[0]["boxes"], results[0]["scores"] boxes = boxes.cpu().detach().numpy().tolist() scores = scores.cpu().detach().numpy().tolist() @@ -256,6 +732,36 @@ def predict(self,img,obj_name): selected_boxes, selected_scores = nms( selected_boxes,selected_scores,self.nms_thresh) return selected_boxes + + def special_predict(self, img): + inputs = self.special_processor( + images=img, return_tensors="pt").to(self.device) + + with torch.no_grad(): + outputs = self.special_model(**inputs) + + target_sizes = torch.tensor([img.size[::-1]]) + results = self.special_processor.post_process_object_detection( + outputs, target_sizes=target_sizes, threshold=0.9)[0] + + boxes, scores = results["boxes"], results["scores"] + boxes = boxes.tolist() + scores = scores.tolist() + if len(boxes)==0: + return [] + + boxes, scores = zip(*sorted(zip(boxes,scores),key=lambda x: x[1],reverse=True)) + selected_boxes = [] + selected_scores = [] + for i in range(len(scores)): + if scores[i] > self.thresh: + coord = self.normalize_coord(boxes[i],img.size) + selected_boxes.append(coord) + selected_scores.append(scores[i]) + + selected_boxes, selected_scores = nms( + selected_boxes,selected_scores,self.nms_thresh) + return selected_boxes def top_box(self,img): w,h = img.size @@ -307,17 +813,26 @@ def execute(self,prog_step,inspect=False): bboxes = [self.left_box(img)] elif obj_name=='RIGHT': bboxes = [self.right_box(img)] + elif obj_name=='object': + bboxes = self.special_predict(img) else: bboxes = self.predict(img,obj_name) box_img = self.box_image(img, bboxes) - prog_step.state[output_var] = bboxes + + objs = dict( + box = bboxes, + category = obj_name, + img = img + ) + + prog_step.state[output_var] = objs prog_step.state[output_var+'_IMAGE'] = box_img if inspect: html_str = self.html(img, box_img, output_var, obj_name) - return bboxes, html_str + return objs, html_str - return bboxes + return objs class Loc2Interpreter(LocInterpreter): @@ -414,6 +929,12 @@ def html(self,img,out_img,output_var,box_img): step_name = html_step_name(self.step_name) box_arg = html_arg_name('bbox') return f"""
{output_var}={step_name}({box_arg}={box_img})={out_img}
""" + + def cropbox(self, box, img): + box = self.expand_box(box, img.size) + out_img = img.crop(box) + + return out_img def execute(self,prog_step,inspect=False): img_var,box_var,output_var = self.parse(prog_step) @@ -1028,7 +1549,7 @@ class ListInterpreter(): def __init__(self): print(f'Registering {self.step_name} step') - openai.api_key = os.getenv("OPENAI_API_KEY") + self.client = OpenAI() def parse(self,prog_step): parse_result = parse_step(prog_step.prog_str) @@ -1040,7 +1561,7 @@ def parse(self,prog_step): return text,list_max,output_var def get_list(self,text,list_max): - response = openai.Completion.create( + response = self.client.Completion.create( model="text-davinci-002", prompt=self.prompt_template.format(list_max=list_max,text=text), temperature=0.7, @@ -1377,4 +1898,12 @@ def register_step_interpreters(dataset='nlvr'): RESULT=ResultInterpreter(), TAG=TagInterpreter(), LOC=Loc2Interpreter(thresh=0.05,nms_thresh=0.3) + ) + elif dataset=='graph': + return dict( + LOC=GLIPLocInterpreter(), + BUILD=BuildInterpreter(), + ADD=ADDInterpreter(), + MERGE=MERGEInterpreter(), + RESULT=ResultInterpreter() ) \ No newline at end of file diff --git a/engine/utils.py b/engine/utils.py index f1bc03f..e6f90e1 100644 --- a/engine/utils.py +++ b/engine/utils.py @@ -3,6 +3,8 @@ import openai import numpy as np import copy +import re +from FlagEmbedding import BGEM3FlagModel from .step_interpreters import register_step_interpreters, parse_step @@ -49,6 +51,8 @@ def execute(self,prog,init_state,inspect=False): class ProgramGenerator(): def __init__(self,prompter,temperature=0.7,top_p=0.5,prob_agg='mean'): openai.api_key = os.getenv("OPENAI_API_KEY") + # print("hello") + # print(os.getenv("OPENAI_API_KEY")) self.prompter = prompter self.temperature = temperature self.top_p = top_p @@ -56,7 +60,7 @@ def __init__(self,prompter,temperature=0.7,top_p=0.5,prob_agg='mean'): def compute_prob(self,response): eos = '<|endoftext|>' - for i,token in enumerate(response.choices[0]['logprobs']['tokens']): + for i,token in enumerate(response.choices[0].logprobs): if token==eos: break @@ -68,22 +72,128 @@ def compute_prob(self,response): raise NotImplementedError return np.exp(agg_fn( - response.choices[0]['logprobs']['token_logprobs'][:i])) + response.choices[0].logprobs.token_logprobs[:i])) def generate(self,inputs): - response = openai.Completion.create( - model="text-davinci-003", - prompt=self.prompter(inputs), + response = openai.chat.completions.create( + model="gpt-4o", + messages=self.prompter.prompt(inputs), temperature=self.temperature, max_tokens=512, top_p=self.top_p, frequency_penalty=0, presence_penalty=0, n=1, - logprobs=1 + logprobs=True ) - prob = self.compute_prob(response) - prog = response.choices[0]['text'].lstrip('\n').rstrip('\n') - return prog, prob + # print(response.choices[0].logprobs) + + # prob = self.compute_prob(response) + answer = response.choices[0].message.content + + return self.prompter.parse(answer) + +class ProgramSynthesis: + + def __init__(self): + self.word_close_model = BGEM3FlagModel('BAAI/bge-m3', + use_fp16=True) + + def parse_objectattribute(self, ObjAttri: str): + + result = {} + + pattern = r'(\w+):\s*([^:\n]+)' + matches = re.findall(pattern, ObjAttri) + + # 解析匹配结果并填充字典 + for match in matches: + key = match[0] + values = match[1].split(', ') + result[key] = values + + return result + + def parse_relation(self, Relation: str): + result = {} + + pattern = r'\(([^)]+)\):\s*([^\n]+)' + matches = re.findall(pattern, Relation) + + for match in matches: + objs = 
match[0].split(', ') + if(len(objs) == 2): + key = tuple(objs) + value = match[1].strip() + result[key] = value + + return result + + def find_close(self, word: str, obj_dict: dict): + candidates = list(obj_dict.keys()) + + embeddings_1 = self.word_close_model.encode([word], + batch_size=12, + max_length=8192, # If you don't need such a long length, you can set a smaller value to speed up the encoding process. + )['dense_vecs'] + embeddings_2 = self.word_close_model.encode(candidates)['dense_vecs'] + similarity = embeddings_1 @ embeddings_2.T + arr = np.array(similarity[0]) + + return candidates[np.argmax(arr)] + + def synthesis(self, ObjAttri: str, Relation: str): + + ObjAttriDict = self.parse_objectattribute(ObjAttri) + RelationDict = self.parse_relation(Relation) + + print(ObjAttriDict) + print(RelationDict) + + obj_var = {} + code = "" + index = 0 + obj_idx = 0 + + for obj_name, attributes in ObjAttriDict.items(): + code += f'OBJS{obj_idx}=LOC(image=IMAGE, object="{obj_name}")\n' + code += f'GRAPH{index}=BUILD(objects=OBJS{obj_idx})\n' + + index += 1 + obj_idx += 1 + + for attri in attributes: + code += f'GRAPH{index}=ADD(graph=GRAPH{index-1}, attribute={attri})\n' + index += 1 + + obj_var[obj_name] = index - 1 + + final_merge_start = index + + for (obja, objb), relation in RelationDict.items(): + if obja not in obj_var: + obja = self.find_close(obja, obj_var) + if objb not in obj_var: + objb = self.find_close(objb, obj_var) + + a_idx = obj_var[obja] + b_idx = obj_var[objb] + + code += f'GRAPH{index}=MERGE(graphA=GRAPH{a_idx}, graphB=GRAPH{b_idx}, relation={relation})\n' + + index += 1 + + result_index = index - 1 + + if(len(RelationDict) > 1): + for i in range(1, len(RelationDict)): + code += f'GRAPH{index}=MERGE(graphA=GRAPH{index-1}, graphB=GRAPH{final_merge_start+i}, relation={None})\n' + index += 1 + + result_index = index - 1 + + code += f"FINAL=RESULT(var=GRAPH{result_index})" + + return code \ No newline at end of file diff --git a/notebooks/gqa.ipynb b/notebooks/gqa.ipynb index 3beefac..2803b5a 100644 --- a/notebooks/gqa.ipynb +++ b/notebooks/gqa.ipynb @@ -1,56 +1,63 @@ { "cells": [ { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "import os\n", - "import sys\n", - "module_path = os.path.abspath(os.path.join('..'))\n", - "if module_path not in sys.path:\n", - " sys.path.append(module_path)" + "> Note: First time run the code may take 15 minutes." 
] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], - "source": [ - "%env OPENAI_API_KEY=" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "env: OPENAI_API_KEY=you-key\n", + "Registering LOC step\n", + "Registering COUNT step\n", + "Registering CROP step\n", + "Registering CROP_RIGHTOF step\n", + "Registering CROP_LEFTOF step\n", + "Registering CROP_FRONTOF step\n", + "Registering CROP_INFRONTOF step\n", + "Registering CROP_INFRONT step\n", + "Registering CROP_BEHIND step\n", + "Registering CROP_AHEAD step\n", + "Registering CROP_BELOW step\n", + "Registering CROP_ABOVE step\n", + "Registering VQA step\n", + "Registering EVAL step\n", + "Registering RESULT step\n", + "hello\n", + "you-key\n" + ] + } + ], "source": [ + "import os\n", + "import sys\n", + "module_path = os.path.abspath(os.path.join('..'))\n", + "if module_path not in sys.path:\n", + " sys.path.append(module_path)\n", + "\n", + "# os.environ[\"http_proxy\"] = \"http://172.20.0.113:12798\"\n", + "# os.environ[\"https_proxy\"] = \"http://172.20.0.113:12798\"\n", + "\n", + "%env OPENAI_API_KEY=your-key\n", + "\n", "from PIL import Image\n", "from IPython.core.display import HTML\n", "from functools import partial\n", "\n", "from engine.utils import ProgramGenerator, ProgramInterpreter\n", - "from prompts.gqa import create_prompt" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "interpreter = ProgramInterpreter(dataset='gqa')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ + "from prompts.gqa import create_prompt\n", + "\n", + "interpreter = ProgramInterpreter(dataset='gqa')\n", + "\n", "prompter = partial(create_prompt,method='all')\n", "generator = ProgramGenerator(prompter=prompter)" ] @@ -61,7 +68,7 @@ "metadata": {}, "outputs": [], "source": [ - "image = Image.open('../assets/camel1.png')\n", + "image = Image.open('../assets/COCO_train2014_000000269022.jpg')\n", "image.thumbnail((640,640),Image.Resampling.LANCZOS)\n", "init_state = dict(\n", " IMAGE=image.convert('RGB')\n", @@ -75,7 +82,7 @@ "metadata": {}, "outputs": [], "source": [ - "question = \"How many people or animals are in the image?\"\n", + "question = \"Is the girl touching the ground?\"\n", "# question = \"Are there more animals than people in the image?\"\n", "# question = \"Localize the woman and tell me the color of her dress.\"\n", "# question = \"Find and tell me the name of the animal in the image.\"\n", @@ -83,7 +90,7 @@ "# question = \"How many women are to the left of the camel?\"\n", "# question = \"Is the lamp to the left of the woman lit?\"\n", "# question = \"Is there a sun in the sky?\"\n", - "prog,_ = generator.generate(dict(question=question))\n", + "prog = generator.generate(question)\n", "print(prog)" ] }, @@ -131,7 +138,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.10" + "version": "3.10.14" }, "orig_nbformat": 4 }, diff --git a/notebooks/text b/notebooks/text new file mode 100644 index 0000000..7a487fa --- /dev/null +++ b/notebooks/text @@ -0,0 +1 @@ +ChoiceLogprobs(content=[ChatCompletionTokenLogprob(token='BOX', bytes=[66, 79, 88], logprob=-0.0009105099, top_logprobs=[]), ChatCompletionTokenLogprob(token='0', bytes=[48], logprob=-8.7212284e-05, top_logprobs=[]), 
ChatCompletionTokenLogprob(token='=', bytes=[61], logprob=-2.188868e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='LOC', bytes=[76, 79, 67], logprob=-9.4914985e-06, top_logprobs=[]), ChatCompletionTokenLogprob(token='(image', bytes=[40, 105, 109, 97, 103, 101], logprob=-0.000110457004, top_logprobs=[]), ChatCompletionTokenLogprob(token='=', bytes=[61], logprob=-6.456359e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='IMAGE', bytes=[73, 77, 65, 71, 69], logprob=-9.849109e-06, top_logprobs=[]), ChatCompletionTokenLogprob(token=',', bytes=[44], logprob=-2.319992e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='object', bytes=[111, 98, 106, 101, 99, 116], logprob=-0.0012027314, top_logprobs=[]), ChatCompletionTokenLogprob(token="='", bytes=[61, 39], logprob=-2.7848862e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='person', bytes=[112, 101, 114, 115, 111, 110], logprob=-0.2478368, top_logprobs=[]), ChatCompletionTokenLogprob(token="')\n", bytes=[39, 41, 10], logprob=-5.6576944e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='BOX', bytes=[66, 79, 88], logprob=-0.00048131612, top_logprobs=[]), ChatCompletionTokenLogprob(token='1', bytes=[49], logprob=-2.6418418e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='=', bytes=[61], logprob=-5.157039e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='LOC', bytes=[76, 79, 67], logprob=-9.968313e-06, top_logprobs=[]), ChatCompletionTokenLogprob(token='(image', bytes=[40, 105, 109, 97, 103, 101], logprob=-7.505351e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='=', bytes=[61], logprob=-3.2736214e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='IMAGE', bytes=[73, 77, 65, 71, 69], logprob=-2.1650272e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token=',', bytes=[44], logprob=-4.465658e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='object', bytes=[111, 98, 106, 101, 99, 116], logprob=-8.017927e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token="='", bytes=[61, 39], logprob=-1.1756368e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='animal', bytes=[97, 110, 105, 109, 97, 108], logprob=-0.004169222, top_logprobs=[]), ChatCompletionTokenLogprob(token="')\n", bytes=[39, 41, 10], logprob=-0.00016802136, top_logprobs=[]), ChatCompletionTokenLogprob(token='ANS', bytes=[65, 78, 83], logprob=-0.0016503064, top_logprobs=[]), ChatCompletionTokenLogprob(token='WER', bytes=[87, 69, 82], logprob=-0.000102827966, top_logprobs=[]), ChatCompletionTokenLogprob(token='0', bytes=[48], logprob=-1.92662e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='=', bytes=[61], logprob=-2.4584822e-06, top_logprobs=[]), ChatCompletionTokenLogprob(token='COUNT', bytes=[67, 79, 85, 78, 84], logprob=-5.3596854e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='(box', bytes=[40, 98, 111, 120], logprob=-8.9834764e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='=', bytes=[61], logprob=-2.188868e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='BOX', bytes=[66, 79, 88], logprob=-6.511407e-06, top_logprobs=[]), ChatCompletionTokenLogprob(token='0', bytes=[48], logprob=-5.931863e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token=')\n', bytes=[41, 10], logprob=-0.003594599, top_logprobs=[]), ChatCompletionTokenLogprob(token='ANS', bytes=[65, 78, 83], logprob=-7.58424e-06, top_logprobs=[]), ChatCompletionTokenLogprob(token='WER', bytes=[87, 69, 82], logprob=-1.1041146e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='1', bytes=[49], logprob=-2.6895234e-05, 
top_logprobs=[]), ChatCompletionTokenLogprob(token='=', bytes=[61], logprob=-1.27099975e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='COUNT', bytes=[67, 79, 85, 78, 84], logprob=-2.4584822e-06, top_logprobs=[]), ChatCompletionTokenLogprob(token='(box', bytes=[40, 98, 111, 120], logprob=-5.276243e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='=', bytes=[61], logprob=-4.012684e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='BOX', bytes=[66, 79, 88], logprob=-7.111979e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='1', bytes=[49], logprob=-0.00015812746, top_logprobs=[]), ChatCompletionTokenLogprob(token=')\n', bytes=[41, 10], logprob=-5.347765e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='ANS', bytes=[65, 78, 83], logprob=-0.040827427, top_logprobs=[]), ChatCompletionTokenLogprob(token='WER', bytes=[87, 69, 82], logprob=-2.7133641e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='2', bytes=[50], logprob=-0.012698344, top_logprobs=[]), ChatCompletionTokenLogprob(token='=E', bytes=[61, 69], logprob=-0.020761896, top_logprobs=[]), ChatCompletionTokenLogprob(token='VAL', bytes=[86, 65, 76], logprob=0.0, top_logprobs=[]), ChatCompletionTokenLogprob(token='(expr', bytes=[40, 101, 120, 112, 114], logprob=-0.00011880126, top_logprobs=[]), ChatCompletionTokenLogprob(token='="{', bytes=[61, 34, 123], logprob=-0.0070076943, top_logprobs=[]), ChatCompletionTokenLogprob(token='ANS', bytes=[65, 78, 83], logprob=-6.392203e-06, top_logprobs=[]), ChatCompletionTokenLogprob(token='WER', bytes=[87, 69, 82], logprob=-5.4385737e-06, top_logprobs=[]), ChatCompletionTokenLogprob(token='0', bytes=[48], logprob=-1.1994775e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='}', bytes=[125], logprob=-7.3458323e-06, top_logprobs=[]), ChatCompletionTokenLogprob(token=' +', bytes=[32, 43], logprob=-0.014847175, top_logprobs=[]), ChatCompletionTokenLogprob(token=' {', bytes=[32, 123], logprob=-2.2723105e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='ANS', bytes=[65, 78, 83], logprob=-5.6769813e-06, top_logprobs=[]), ChatCompletionTokenLogprob(token='WER', bytes=[87, 69, 82], logprob=-8.537869e-06, top_logprobs=[]), ChatCompletionTokenLogprob(token='1', bytes=[49], logprob=-1.4974867e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='}")\n', bytes=[125, 34, 41, 10], logprob=-3.88156e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='FINAL', bytes=[70, 73, 78, 65, 76], logprob=-3.070975e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='_RESULT', bytes=[95, 82, 69, 83, 85, 76, 84], logprob=-2.2842309e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='=', bytes=[61], logprob=-2.6299214e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='RESULT', bytes=[82, 69, 83, 85, 76, 84], logprob=-7.107425e-06, top_logprobs=[]), ChatCompletionTokenLogprob(token='(var', bytes=[40, 118, 97, 114], logprob=-7.58424e-06, top_logprobs=[]), ChatCompletionTokenLogprob(token='=', bytes=[61], logprob=-4.8425554e-06, top_logprobs=[]), ChatCompletionTokenLogprob(token='ANS', bytes=[65, 78, 83], logprob=-3.0545007e-06, top_logprobs=[]), ChatCompletionTokenLogprob(token='WER', bytes=[87, 69, 82], logprob=-2.5226382e-05, top_logprobs=[]), ChatCompletionTokenLogprob(token='2', bytes=[50], logprob=-0.0005008472, top_logprobs=[]), ChatCompletionTokenLogprob(token=')', bytes=[41], logprob=-1.0280384e-06, top_logprobs=[])]) diff --git a/prompts/gqa.py b/prompts/gqa.py index 1b572fa..1307708 100644 --- a/prompts/gqa.py +++ b/prompts/gqa.py @@ -1,53 +1,145 @@ 
import random
-GQA_CURATED_EXAMPLES=[
-"""Question: Is the vehicle in the top of the image?
+GQA_CURATED_MESSAGES=[
+{
+"role": "system",
+"content": """You are a visual programmer. You need to generate the corresponding code according to the user's question (without seeing the picture).
+Here are some visual functions:
+- LOC(image=, object=): it can locate the specific region of the image and return the region box; it is able to detect objects.
+- CROP(image=, box=): it can crop the image, fitting into the region box.
+- COUNT(box=): it can count the number of region boxes.
+- CROP_RIGHTOF(image=,box=): it can crop the image, leaving the area to the right of the region box.
+- CROP_LEFTOF(image=,box=): it can crop the image, leaving the area to the left of the region box.
+- CROP_FRONTOF(image=,box=): it can crop the image, leaving the area in front of the region box.
+- CROP_INFRONTOF(image=,box=): it can crop the image, leaving the area in front of the region box.
+- CROP_INFRONT(image=,box=): it can crop the image, leaving the area in front of the region box.
+- CROP_BEHIND(image=,box=): it can crop the image, leaving the area behind the region box.
+- CROP_AHEAD(image=,box=): it can crop the image, leaving the area ahead of the region box.
+- CROP_BELOW(image=,box=): it can crop the image, leaving the area below the region box.
+- CROP_ABOVE(image=,box=): it can crop the image, leaving the area above the region box.
+- VQA(image=,question=): it can answer the question according to the image.
+- RESULT(var=): it can use a variable as the final result.
+
+REMEMBER: You must always generate a program. Try to handle the question step by step.
+"""
+},
+{
+"role": "user",
+"content": "Question: Is the vehicle in the top of the image?"
+},
+{
+"role": "assistant",
+"content": """Reasoning:
+1. I need to locate the top area.
+2. Find vehicles.
+3. Count the number of vehicles.
+4. If the number of vehicles is greater than 0, say "yes".
+5. Otherwise, say "no".
Program:
BOX0=LOC(image=IMAGE,object='TOP')
IMAGE0=CROP(image=IMAGE,box=BOX0)
BOX1=LOC(image=IMAGE0,object='vehicle')
ANSWER0=COUNT(box=BOX1)
ANSWER1=EVAL(expr="'yes' if {ANSWER0} > 0 else 'no'")
-FINAL_RESULT=RESULT(var=ANSWER1)
-""",
-"""Question: Are there trains or fences in this scene?
+FINAL_RESULT=RESULT(var=ANSWER1)"""
+},
+{
+"role": "user",
+"content": "Question: Are there trains or fences in this scene?"
+},
+{
+"role": "assistant",
+"content": """Reasoning:
+1. I need to find the trains.
+2. I need to find the fences.
+3. Count the number of trains.
+4. Count the number of fences.
+5. Add the numbers of trains and fences.
+6. If the sum is greater than 0, say "yes". Otherwise, "no".
Program:
BOX0=LOC(image=IMAGE,object='train')
BOX1=LOC(image=IMAGE,object='fence')
ANSWER0=COUNT(box=BOX0)
ANSWER1=COUNT(box=BOX1)
ANSWER2=EVAL(expr="'yes' if {ANSWER0} + {ANSWER1} > 0 else 'no'")
-FINAL_RESULT=RESULT(var=ANSWER)
-""",
-"""Question: Who is carrying the umbrella?
+FINAL_RESULT=RESULT(var=ANSWER2)"""
+},
+{
+"role": "user",
+"content": "Question: Who is carrying the umbrella?"
+},
+{
+"role": "assistant",
+"content": """Reasoning:
+1. I need to find the umbrella.
+2. Check out the area of the umbrella.
+3. Ask who is carrying the umbrella.
Program:
BOX0=LOC(image=IMAGE,object='umbrella')
IMAGE0=CROP(image=IMAGE,box=BOX0)
ANSWER0=VQA(image=IMAGE0,question='Who is carrying the umbrella?')
-FINAL_RESULT=RESULT(var=ANSWER0)
-""",
-"""Question: Which place is it?
+FINAL_RESULT=RESULT(var=ANSWER0)"""
+},
+{
+"role": "user",
+"content": "Question: Which place is it?"
+},
+{
+"role": "assistant",
+"content": """Reasoning:
+1. I don't have any object information, so I decide to simply ask.
Program:
ANSWER0=VQA(image=IMAGE,question='Which place is it?')
-FINAL_RESULT=RESULT(var=ANSWER0)
-""",
-"""Question: What color is the curtain that is to the right of the mirror?
+FINAL_RESULT=RESULT(var=ANSWER0)"""
+},
+{
+"role": "user",
+"content": "Question: What color is the curtain that is to the right of the mirror?"
+},
+{
+"role": "assistant",
+"content": """Reasoning:
+1. I need to find the mirror.
+2. Locate the area to the right of the mirror.
+3. Ask the color of the curtain.
+4. Mark the color as the answer.
Program:
BOX0=LOC(image=IMAGE,object='mirror')
IMAGE0=CROP_RIGHTOF(image=IMAGE,box=BOX0)
ANSWER0=VQA(image=IMAGE0,question='What color is the curtain?')
-FINAL_RESULT=RESULT(var=ANSWER0)
-""",
-"""Question: Is the pillow in the top part or in the bottom of the picture?
+FINAL_RESULT=RESULT(var=ANSWER0)"""
+},
+{
+"role": "user",
+"content": "Question: Is the pillow in the top part or in the bottom of the picture?"
+},
+{
+"role": "assistant",
+"content": """Reasoning:
+1. I decide to check out the top of the picture.
+2. Are there any pillows?
+3. If the number of pillows is greater than 0, say "top".
+4. Otherwise, say "bottom".
Program:
BOX0=LOC(image=IMAGE,object='TOP')
IMAGE0=CROP(image=IMAGE,box=BOX0)
BOX1=LOC(image=IMAGE0,object='pillow')
ANSWER0=COUNT(box=BOX1)
ANSWER1=EVAL(expr="'top' if {ANSWER0} > 0 else 'bottom'")
-FINAL_RESULT=RESULT(var=ANSWER1)
-""",
-"""Question: Question: Do you see bottles to the right of the wine on the left of the picture?
+FINAL_RESULT=RESULT(var=ANSWER1)"""
+},
+{
+"role": "user",
+"content": "Question: Do you see bottles to the right of the wine on the left of the picture?"
+},
+{
+"role": "assistant",
+"content": """Reasoning:
+1. I need to locate the left of the picture.
+2. I need to find the wine on the left of the picture.
+3. Check out the area to the right of the wine on the left of the picture.
+4. Are there any bottles?
+5. If the number of bottles is greater than 0, say "yes". Otherwise, say "no".
Program:
BOX0=LOC(image=IMAGE,object='LEFT')
IMAGE0=CROP(image=IMAGE,box=BOX0)
@@ -56,32 +148,71 @@
BOX2=LOC(image=IMAGE1,object='bottles')
ANSWER0=COUNT(box=BOX2)
ANSWER1=EVAL(expr="'yes' if {ANSWER0} > 0 else 'no'")
-FINAL_RESULT=RESULT(var=ANSWER1)
-""",
-"""Question: Is the street light standing behind a truck?
+FINAL_RESULT=RESULT(var=ANSWER1)"""
+},
+{
+"role": "user",
+"content": "Question: Is the street light standing behind a truck?"
+},
+{
+"role": "assistant",
+"content": """Reasoning:
+1. I need to find the truck.
+2. Check out the area behind the truck.
+3. Count the street lights behind the truck.
+4. If the number of street lights is greater than 0, say "yes". Otherwise, say "no".
Program:
BOX0=LOC(image=IMAGE,object='truck')
IMAGE0=CROP_BEHIND(image=IMAGE,box=BOX0)
BOX1=LOC(image=IMAGE0,object='street light')
ANSWER0=COUNT(box=BOX1)
ANSWER1=EVAL(expr="'yes' if {ANSWER0} > 0 else 'no'")
-FINAL_RESULT=RESULT(var=ANSWER1)
-""",
-"""Question: Which side is the food on?
+FINAL_RESULT=RESULT(var=ANSWER1)"""
+},
+{
+"role": "user",
+"content": "Question: Which side is the food on?"
+},
+{
+"role": "assistant",
+"content": """Reasoning:
+1. I decide to check the right side first.
+2. Check whether there is any food on the right side.
+3. If the number of food on the right side is greater than zero, say "right".
+4. Otherwise, "left".
Program:
BOX0=LOC(image=IMAGE,object='RIGHT')
IMAGE0=CROP(image=IMAGE,box=BOX0)
BOX1=LOC(image=IMAGE0,object='food')
ANSWER0=COUNT(box=BOX1)
ANSWER1=EVAL(expr="'right' if {ANSWER0} > 0 else 'left'")
-FINAL_RESULT=RESULT(var=ANSWER1)
-""",
-"""Question: What do the wetsuit and the sky have in common?
+FINAL_RESULT=RESULT(var=ANSWER1)"""
+},
+{
+"role": "user",
+"content": "Question: What do the wetsuit and the sky have in common?"
+},
+{
+"role": "assistant",
+"content": """Reasoning:
+1. I have no idea of the common attribute, so I decide to simply ask.
Program:
ANSWER0=VQA(image=IMAGE,question='What do the wetsuit and the sky have in common?')
-FINAL_RESULT=RESULT(var=ANSWER0)
-""",
-"""Question: Do the post and the sign have a different colors?
+FINAL_RESULT=RESULT(var=ANSWER0)"""
+},
+{
+"role": "user",
+"content": "Question: Do the post and the sign have a different colors?"
+},
+{
+"role": "assistant",
+"content": """Reasoning:
+1. I need to find the post.
+2. I need to find the sign.
+3. I need to ask the color of the post.
+4. I need to ask the color of the sign.
+5. Compare the colors of the post and the sign.
+6. If the colors are different, say "yes". Otherwise, "no".
Program:
BOX0=LOC(image=IMAGE,object='post')
IMAGE0=CROP(image=IMAGE,box=BOX0)
@@ -90,40 +221,86 @@
ANSWER0=VQA(image=IMAGE0,question='What color is the post?')
ANSWER1=VQA(image=IMAGE1,question='What color is the sign?')
ANSWER2=EVAL(expr="'yes' if {ANSWER0} != {ANSWER1} else 'no'")
-FINAL_RESULT=RESULT(var=ANSWER2)
-""",
-"""Question: Does the traffic cone have white color?
+FINAL_RESULT=RESULT(var=ANSWER2)"""
+},
+{
+"role": "user",
+"content": "Question: Does the traffic cone have white color?"
+},
+{
+"role": "assistant",
+"content": """Reasoning:
+1. I need to find the traffic cone.
+2. I need to ask the color of the traffic cone.
+3. If the color of the traffic cone is white, I say "yes". Otherwise, "no".
Program:
BOX0=LOC(image=IMAGE,object='traffic cone')
IMAGE0=CROP(image=IMAGE,box=BOX0)
ANSWER0=VQA(image=IMAGE0,question='What color is the traffic cone?')
ANSWER1=EVAL(expr="'yes' if {ANSWER0} == 'white' else 'no'")
-FINAL_RESULT=RESULT(var=ANSWER1)
-""",
-"""Question: Are these animals of different species?
+FINAL_RESULT=RESULT(var=ANSWER1)"""
+},
+{
+"role": "user",
+"content": "Question: Are these animals of different species?"
+},
+{
+"role": "assistant",
+"content": """Reasoning:
+1. I can't check every animal, so I decide to simply ask.
Program:
ANSWER0=VQA(image=IMAGE,question='Are these animals of different species?')
-FINAL_RESULT=RESULT(var=ANSWER0)
-""",
-"""Question: Which side of the image is the chair on?
+FINAL_RESULT=RESULT(var=ANSWER0)"""
+},
+{
+"role": "user",
+"content": "Question: Which side of the image is the chair on?"
+},
+{
+"role": "assistant",
+"content": """Reasoning:
+1. First, I check the right side of the image.
+2. Is there a chair?
+3. If there is a chair, I say "right". Otherwise, "left".
Program:
BOX0=LOC(image=IMAGE,object='RIGHT')
IMAGE0=CROP(image=IMAGE,box=BOX0)
BOX1=LOC(image=IMAGE0,object='chair')
ANSWER0=COUNT(box=BOX1)
ANSWER1=EVAL(expr="'right' if {ANSWER0} > 0 else 'left'")
-FINAL_RESULT=RESULT(var=ANSWER1)
-""",
-"""Question: Do you see any drawers to the left of the plate?
+FINAL_RESULT=RESULT(var=ANSWER1)"""
+},
+{
+"role": "user",
+"content": "Question: Do you see any drawers to the left of the plate?"
+},
+{
+"role": "assistant",
+"content": """Reasoning:
+1. I need to find the plate.
+2. I need to look at the left of the plate.
+3. Are there any drawers?
+4. If the number of drawers is greater than 0, I say "yes". Otherwise, "no".
Program:
BOX0=LOC(image=IMAGE,object='plate')
IMAGE0=CROP_LEFTOF(image=IMAGE,box=BOX0)
BOX1=LOC(image=IMAGE0,object='drawers')
ANSWER0=COUNT(box=BOX1)
ANSWER1=EVAL(expr="'yes' if {ANSWER0} > 0 else 'no'")
-FINAL_RESULT=RESULT(var=ANSWER1)
-""",
-"""Question: Does the mat have the same color as the sky?
+FINAL_RESULT=RESULT(var=ANSWER1)"""
+},
+{
+"role": "user",
+"content": "Question: Does the mat have the same color as the sky?"
+},
+{
+"role": "assistant",
+"content": """Reasoning:
+1. I need to find the sky.
+2. I need to find the mat.
+3. I need to know the sky's color.
+4. I need to know the mat's color.
+5. If the two colors are the same, I say "yes". Otherwise, "no".
Program:
BOX0=LOC(image=IMAGE,object='sky')
IMAGE0=CROP(image=IMAGE,box=BOX0)
@@ -132,57 +309,45 @@
ANSWER0=VQA(image=IMAGE0,question='What color is the sky?')
ANSWER1=VQA(image=IMAGE1,question='What color is the mat?')
ANSWER2=EVAL(expr="'yes' if {ANSWER0} == {ANSWER1} else 'no'")
-FINAL_RESULT=RESULT(var=ANSWER2)
-""",
-"""Question: Is a cat above the mat?
+FINAL_RESULT=RESULT(var=ANSWER2)"""
+},
+{
+"role": "user",
+"content": "Question: Is a cat above the mat?"
+},
+{
+"role": "assistant",
+"content": """Reasoning:
+1. I need to find the mat.
+2. Look above the mat and see if there is a cat.
+3. If there is a cat, I say "yes". Otherwise, "no".
Program:
BOX0=LOC(image=IMAGE,object='mat')
IMAGE0=CROP_ABOVE(image=IMAGE,box=BOX0)
BOX1=LOC(image=IMAGE0,object='cat')
ANSWER0=COUNT(box=BOX1)
ANSWER1=EVAL(expr="'yes' if {ANSWER0} > 0 else 'no'")
-FINAL_RESULT=RESULT(var=ANSWER1)
-"""
-"""Question: Is the cat above a mat?
-Program:
-BOX0=LOC(image=IMAGE,object='cat')
-IMAGE0=CROP_BELOW(image=IMAGE,box=BOX0)
-BOX1=LOC(image=IMAGE0,object='mat')
-ANSWER0=COUNT(box=BOX1)
-ANSWER1=EVAL(expr="'yes' if {ANSWER0} > 0 and else 'no'")
-FINAL_RESULT=RESULT(var=ANSWER1)
-""",
-"""Question: Is the mat below a cat?
-Program:
-BOX0=LOC(image=IMAGE,object='mat')
-IMAGE0=CROP_ABOVE(image=IMAGE,box=BOX0)
-BOX1=LOC(image=IMAGE0,object='cat')
-ANSWER0=COUNT(box=BOX1)
-ANSWER1=EVAL(expr="'yes' if {ANSWER0} > 0 else 'no'")
-FINAL_RESULT=RESULT(var=ANSWER1)
-""",
-"""Question: Is a mat below the cat?
-Program:
-BOX0=LOC(image=IMAGE,object='cat')
-IMAGE0=CROP_BELOW(image=IMAGE,box=BOX0)
-BOX1=LOC(image=IMAGE0,object='mat')
-ANSWER0=COUNT(box=BOX1)
-ANSWER1=EVAL(expr="'yes' if {ANSWER0} > 0 and else 'no'")
-FINAL_RESULT=RESULT(var=ANSWER1)
-""",
+FINAL_RESULT=RESULT(var=ANSWER1)"""
+},
]
def create_prompt(inputs,num_prompts=8,method='random',seed=42,group=0):
-    if method=='all':
-        prompt_examples = GQA_CURATED_EXAMPLES
-    elif method=='random':
-        random.seed(seed)
-        prompt_examples = random.sample(GQA_CURATED_EXAMPLES,num_prompts)
-    else:
-        raise NotImplementedError
+    # if method=='all':
+    #     prompt_examples = GQA_CURATED_EXAMPLES
+    # elif method=='random':
+    #     random.seed(seed)
+    #     prompt_examples = random.sample(GQA_CURATED_EXAMPLES,num_prompts)
+    # else:
+    #     raise NotImplementedError
+
+
+
+    # prompt_examples = '\n'.join(prompt_examples)
+    # prompt_examples = f'Think step by step to answer the question.\n\n{prompt_examples}'
-    prompt_examples = '\n'.join(prompt_examples)
-    prompt_examples = f'Think step by step to answer the question.\n\n{prompt_examples}'
+    prompt_examples = list(GQA_CURATED_MESSAGES)  # copy so repeated calls do not grow the shared example list
+    prompt_examples.append({"role":"user", "content":f"Question: {inputs}"})
+    # print(prompt_examples[35]["content"])
-    return prompt_examples + "\nQuestion: {question}\nProgram:".format(**inputs)
\ No newline at end of file
+    return prompt_examples
\ No newline at end of file
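Note on the prompts/gqa.py change above: create_prompt now returns an OpenAI-style chat message list (system message, few-shot user/assistant turns, then the new question) instead of a single completion string. The sketch below shows one way such a message list could be sent to the chat completions API and the program text recovered from the reply; the model name, client setup, and the "Program:" parsing are assumptions for illustration, since the repository's actual call lives in engine.utils.ProgramGenerator, which this diff does not touch.

```
# Illustrative sketch only; model name and reply parsing are assumptions.
import os

from openai import OpenAI

from prompts.gqa import create_prompt

client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])


def generate_program(question: str) -> str:
    # system prompt + few-shot turns + the new question, as built by create_prompt
    messages = create_prompt(question)
    reply = client.chat.completions.create(
        model="gpt-3.5-turbo",  # assumed model
        messages=messages,
        temperature=0,
    ).choices[0].message.content
    # The few-shot replies follow "Reasoning: ... Program: ...", so keep only the program part.
    return reply.split("Program:")[-1].strip()


print(generate_program("Is the girl touching the ground?"))
```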
diff --git a/prompts/object_attribute.py b/prompts/object_attribute.py
new file mode 100644
index 0000000..ad971ea
--- /dev/null
+++ b/prompts/object_attribute.py
@@ -0,0 +1,94 @@
+import re
+from .prompter import Prompter
+
+class ObjectAttributePrompter(Prompter):
+
+    MESSAGE = [
+        {
+            "role": "system",
+            "content": """Given a question, identify the entities mentioned in the question and list their corresponding attributes. Each entity should be on a new line, and each attribute should be separated by a comma. Here is an example:
+
+**Question:** "Which kind of clothing is not black?"
+
+**Answer:**
+
+```
+clothing: color
+```
+
+Make sure to maintain the format and accuracy in identifying entities and their attributes for each question.
+"""
+        },
+        {
+            "role": "user",
+            "content": "**Question**: Who is carrying the umbrella?"
+        },
+        {
+            "role": "assistant",
+            "content": """**Answer:**
+
+```
+people: position
+umbrella: position
+```
+"""
+        },
+        {
+            "role": "user",
+            "content": "**Question**: Which place is it?"
+        },
+        {
+            "role": "assistant",
+            "content": """**Answer:**
+
+```
+object: usage, scene
+```
+"""
+        },
+        {
+            "role": "user",
+            "content": "**Question**: Does the clothing look large?"
+        },
+        {
+            "role": "assistant",
+            "content": """**Answer:**
+
+```
+clothing: size
+```
+"""
+        },
+        {
+            "role": "user",
+            "content": "**Question**: Is the chair warm?"
+        },
+        {
+            "role": "assistant",
+            "content": """**Answer:**
+
+```
+chair: feeling(warm or cold)
+```
+"""
+        },
+    ]
+
+    def prompt(self, input):
+
+        prompt_examples = list(self.MESSAGE)  # copy so repeated calls do not mutate the class-level MESSAGE
+        prompt_examples.append({"role":"user", "content":f"**Question**: {input}"})
+
+        # print(prompt_examples[35]["content"])
+
+        return prompt_examples
+
+    def parse(self, output):
+        matches = re.findall(r'```([^`]*)```', output)
+
+        if len(matches) > 0:
+            return matches[0]
+
+        return ""
+
+    
\ No newline at end of file
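A short usage sketch for the new ObjectAttributePrompter: prompt() builds the chat messages for a question and parse() pulls the fenced block out of the model's reply. The chat call and model name below are assumptions; only prompt() and parse() come from the file above.

```
# Sketch: driving ObjectAttributePrompter by hand (the chat call is an assumption).
import os

from openai import OpenAI

from prompts.object_attribute import ObjectAttributePrompter

client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
prompter = ObjectAttributePrompter()

messages = prompter.prompt("Who is carrying the umbrella?")
reply = client.chat.completions.create(
    model="gpt-3.5-turbo",  # assumed model
    messages=messages,
).choices[0].message.content

# parse() returns the text between the triple-backtick fences,
# e.g. "people: position\numbrella: position" for the question above.
print(prompter.parse(reply))
```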
diff --git a/prompts/prompter.py b/prompts/prompter.py
new file mode 100644
index 0000000..e607f36
--- /dev/null
+++ b/prompts/prompter.py
@@ -0,0 +1,12 @@
+class Prompter:
+
+    MESSAGE = []
+
+    def __init__(self):
+        pass
+
+    def prompt(self, input):
+        return input
+
+    def parse(self, output):
+        return output
\ No newline at end of file
diff --git a/prompts/relation.py b/prompts/relation.py
new file mode 100644
index 0000000..2ef1353
--- /dev/null
+++ b/prompts/relation.py
@@ -0,0 +1,85 @@
+import re
+from .prompter import Prompter
+
+class RelationPrompter(Prompter):
+    MESSAGE = [
+        {
+            "role": "system",
+            "content": """Given a question and a list of objects along with their attributes, identify and describe the relationship between the objects mentioned in the question.
+
+For example, if the question is "Who is carrying the umbrella?" and the objects and attributes are:
+
+```
+people: position
+umbrella: position
+```
+
+The output should be:
+
+```
+(people, umbrella): carry
+```
+
+This indicates that there is a "carry" relationship between "people" and "umbrella".
+
+**Input:**
+
+1. A question.
+2. A list of objects with their associated attributes.
+
+**Output:**
+
+The relationship between the objects mentioned in the question, formatted as (object1, object2): relationship.
+ """
+        },
+        {
+            "role": "user",
+            "content": """**Question:** "Who is driving the car?"
+**Objects and attributes:**
+```
+person: role
+car: object
+```"""
+        },
+        {
+            "role": "assistant",
+            "content": """**Output:**
+```
+(person, car): drive
+```"""
+        },
+        {
+            "role": "user",
+            "content": """**Question:** "What animal is eating the grass?"
+**Objects and attributes:**
+```
+animal: species
+grass: object
+```"""
+        },
+        {
+            "role": "assistant",
+            "content": """**Output:**
+```
+(animal, grass): eat
+```"""
+        },
+    ]
+
+    def prompt(self, input):
+
+        prompt_examples = list(self.MESSAGE)  # copy so repeated calls do not mutate the class-level MESSAGE
+        Question = input["Question"]
+        ObjectAttribute = input["ObjectAttribute"]
+        prompt_examples.append({"role":"user", "content":f"**Question**: {Question}\n**Objects and attributes:**\n```\n{ObjectAttribute}\n```"})
+
+        return prompt_examples
+
+    def parse(self, output):
+        matches = re.findall(r'```([^`]*)```', output)
+
+        if len(matches) > 0:
+            return matches[0]
+
+        return ""
+    
\ No newline at end of file
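The two prompters share the minimal Prompter interface from prompts/prompter.py and are natural to chain: the attribute list parsed from ObjectAttributePrompter feeds RelationPrompter, whose prompt() expects a dict with "Question" and "ObjectAttribute" keys. Below is a hedged sketch of that chain, with a hypothetical ask() helper standing in for whatever driver code the repository actually uses.

```
# Sketch of chaining the two prompters; ask() is a hypothetical helper,
# not something defined in this diff.
import os

from openai import OpenAI

from prompts.object_attribute import ObjectAttributePrompter
from prompts.relation import RelationPrompter

client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])


def ask(messages):
    # assumed model; returns the assistant's text reply
    return client.chat.completions.create(
        model="gpt-3.5-turbo", messages=messages
    ).choices[0].message.content


question = "Who is carrying the umbrella?"

# Step 1: entities and attributes mentioned in the question.
attr_prompter = ObjectAttributePrompter()
attributes = attr_prompter.parse(ask(attr_prompter.prompt(question)))

# Step 2: relationships between those entities.
rel_prompter = RelationPrompter()
relations = rel_prompter.parse(
    ask(rel_prompter.prompt({"Question": question, "ObjectAttribute": attributes}))
)
print(relations)  # expected to look like "(people, umbrella): carry"
```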